/**
 *
 *  Copyright 2016-2020 Netflix, Inc.
 *
 *     Licensed under the BSD+Patent License (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         https://opensource.org/licenses/BSDplusPatent
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
18
#include <errno.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "feature/integer_motion.h"
#include "feature/common/alignment.h"

#include <immintrin.h>
29
x_convolution_16_avx512(const uint16_t * src,uint16_t * dst,unsigned width,unsigned height,ptrdiff_t src_stride,ptrdiff_t dst_stride)30 void x_convolution_16_avx512(const uint16_t *src, uint16_t *dst, unsigned width,
31 unsigned height, ptrdiff_t src_stride,
32 ptrdiff_t dst_stride)
33 {
34 const unsigned radius = filter_width / 2;
35 const unsigned left_edge = vmaf_ceiln(radius, 1);
36 const unsigned right_edge = vmaf_floorn(width - (filter_width - radius), 1);
37 const unsigned shift_add_round = 32768;
38 const unsigned vector_loop = (width>>5) -1;
39 uint16_t *tmpdst = dst;
40 uint16_t *src_p = (uint16_t*) src + (left_edge - radius);
41 unsigned nr = left_edge + 32 *vector_loop;
42 uint16_t *src_pt = (uint16_t*) src + nr -radius;
43 for (unsigned i = 0; i < height; ++i) {
44 for (unsigned j = 0; j < left_edge; j++) {
45 dst[i * dst_stride + j] =
46 (edge_16(true, src, width, height, src_stride, i, j) +
47 shift_add_round) >> 16;
48 }
49
50 }
51
52 for (unsigned i = 0; i < height; ++i) {
53 uint16_t *src_p1 = src_p;
54 for (unsigned j = 0; j <vector_loop; j=j+1) {
55 __m512i src1 = _mm512_loadu_si512 ((__m512i *)src_p1);
56 __m512i kernel1 = _mm512_set1_epi16( 3571);
57 __m512i kernel2 = _mm512_set1_epi16( 16004);
58 __m512i kernel3 = _mm512_set1_epi16( 26386) ;
59 __m512i result = _mm512_mulhi_epu16(src1,kernel1);
60 __m512i resultlo = _mm512_mullo_epi16(src1,kernel1);
61
62 // src1= src1>>16; //shift by a pixel
63 __m512i src2 = _mm512_loadu_si512 ((__m512i *)(src_p1+1));
64 __m512i result2 = _mm512_mulhi_epu16(src2,kernel2);
65 __m512i result2lo = _mm512_mullo_epi16(src2,kernel2);
66 __m512i accum1_lo = _mm512_unpacklo_epi16(resultlo, result);
67 __m512i accum1_hi = _mm512_unpackhi_epi16(resultlo, result);
68 __m512i accum2_lo = _mm512_unpacklo_epi16(result2lo, result2);
69 __m512i accum2_hi = _mm512_unpackhi_epi16(result2lo, result2);
70
71 // Filter[3] value
72 // src1= src1>>32;
73 __m512i src3 = _mm512_loadu_si512 ((__m512i *)(src_p1+2));
74 __m512i result3 = _mm512_mulhi_epu16(src3,kernel3);
75 __m512i result3lo = _mm512_mullo_epi16(src3,kernel3);
76 __m512i accum3_lo = _mm512_unpacklo_epi16 (result3lo, result3);
77 __m512i accum3_hi = _mm512_unpackhi_epi16 (result3lo, result3);
78 //filter 4
79 src1 = _mm512_loadu_si512 ((__m512i *)(src_p1+3));
80 result = _mm512_mulhi_epu16(src1,kernel2);
81 resultlo = _mm512_mullo_epi16(src1,kernel2);
82
83 //Filter 5
84 src2 = _mm512_loadu_si512((__m512i *)(src_p1+4));
85 result2 = _mm512_mulhi_epu16(src2,kernel1);
86 result2lo = _mm512_mullo_epi16(src2,kernel1);
87
88 __m512i accum4_lo =_mm512_unpacklo_epi16(resultlo, result);
89 __m512i accum4_hi =_mm512_unpackhi_epi16(resultlo, result);
90 __m512i accum5_lo =_mm512_unpacklo_epi16(result2lo, result2);
91 __m512i accum5_hi =_mm512_unpackhi_epi16(result2lo, result2);
92
93 __m512i addnum = _mm512_set1_epi32(32768);
94 __m512i accum_lo = _mm512_add_epi32(accum1_lo,accum2_lo);
95 __m512i accumi_lo = _mm512_add_epi32(accum3_lo,accum4_lo);
96 accum5_lo = _mm512_add_epi32(accum5_lo,addnum);
97 accum_lo = _mm512_add_epi32(accum5_lo,accum_lo);
98 accum_lo = _mm512_add_epi32(accumi_lo,accum_lo);
99 __m512i accum_hi = _mm512_add_epi32(accum1_hi,accum2_hi);
100 __m512i accumi_hi = _mm512_add_epi32(accum3_hi,accum4_hi);
101 accum_hi = _mm512_add_epi32(accum5_hi,accum_hi);
102 accumi_hi = _mm512_add_epi32(accumi_hi,addnum);
103 accum_hi = _mm512_add_epi32(accumi_hi,accum_hi);
104 accum_lo = _mm512_srli_epi32(accum_lo, 0x10);
105 accum_hi = _mm512_srli_epi32(accum_hi, 0x10);
106
107 result = _mm512_packus_epi32(accum_lo,accum_hi);
108 _mm512_storeu_si512((__m512i *) (dst+ i * dst_stride + j*32+ left_edge),result);
109 src_p1+=32;
110 }
111
112 src_p += src_stride;
113 }
114
115 for (unsigned i = 0; i < height; ++i) {
116 uint16_t *src_p1 = src_pt;
117 for (unsigned j = nr; j < (right_edge); j++) {
118 uint32_t accum = 0;
119 uint16_t *src_p2 = src_p1;
120 for (unsigned k = 0; k < filter_width; ++k) {
121 accum += filter[k] * (*src_p2);
122 src_p2++;
123 }
124 src_p1++;
125 dst[i * dst_stride + j] = (accum + shift_add_round) >> 16;
126 }
127 src_pt += src_stride;
128 }
129
130 for (unsigned i = 0; i < height; ++i) {
131 for (unsigned j = right_edge; j < width; j++) {
132 dst[i * dst_stride + j] =
133 (edge_16(true, src, width, height, src_stride, i, j) +
134 shift_add_round) >> 16;
135 }
136 }
137 }
138