/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: Since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The coefficients in this table are rearranged so that we can get them
     into the correct order more quickly.
*/
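
/* For illustration (a sketch, not part of the original code): a row of this
   table stores a hypothetical 8-tap filter {f0, f1, ..., f7} as
   {f0, f2, f4, f6, f1, f3, f5, f7}, and the scalar equivalent of the
   summation order used below is:

     int32_t sum = 0;
     for (int t = 0; t < 8; t += 2) sum += f[t] * x[t];  // (0 + 2) + (4 + 6)
     for (int t = 1; t < 8; t += 2) sum += f[t] * x[t];  // (1 + 3) + (5 + 7)

   where x[0..7] are the input pixels. The SIMD code below computes the same
   sums two taps at a time with _mm_maddubs_epi16. */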
/* clang-format off */
DECLARE_ALIGNED(8, static const int8_t,
                filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
  { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
  { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
  { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
  { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
  { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
  { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
  { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
  { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
  { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
  { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
  { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
  { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
  { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
  { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
  { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
  { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
  { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
  { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
  { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
  { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
  { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
  { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
  { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
  { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
  { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
  { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
  { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
  { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
  { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
  { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
  { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
  // [0, 1)
  { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
  { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
  { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
  {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
  {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
  {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
  {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
  {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
  {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
  {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
  {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
  {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
  {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
  {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
  {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
  {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
  {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
  {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
  {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
  {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
  {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
  {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
  {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
  {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
  {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
  {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
  {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
  {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
  {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
  {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
  { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
  { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
  // [1, 2)
  { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
  { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
  { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
  { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
  { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
  { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
  { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
  { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
  { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
  { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
  { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
  { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
  { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
  { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
  { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
  { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
  { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
  { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
  { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
  { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
  { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
  { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
  { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
  { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
  { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
  { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
  { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
  { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
  { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
  { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
  { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
  { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0, 2, -1, 0, 0, 127, 0},

#else
  // [-1, 0)
  { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
  { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
  { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
  { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
  { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
  { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
  { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
  { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
  { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
  { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
  { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
  { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
  { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
  { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
  { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
  { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
  // [0, 1)
  { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
  { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
  {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
  {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
  {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
  {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
  {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
  {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
  {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
  {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
  {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
  {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
  {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
  {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
  {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
  { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
  // [1, 2)
  { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
  { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
  { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
  { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
  { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
  { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
  { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
  { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
  { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
  { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
  { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
  { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
  { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
  { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
  { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
  { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
  // dummy (replicate row index 95)
  { 0, 0, 4, -3, 0, -1, 127, 1},
#endif  // WARPEDPIXEL_PREC_BITS == 6
};
/* clang-format on */

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
                                       8, 10, 10, 12, 12, 14, 14, 0 };
static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
                                      9, 11, 11, 13, 13, 15, 15, 0 };
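
/* For example (a sketch, not part of the original code): if the source
   register holds bytes {p0, p1, ..., p15}, then
     _mm_shuffle_epi8(src, even_mask) = {p0, p2, p2, p4, ..., p12, p14, p14, *}
     _mm_shuffle_epi8(src, odd_mask)  = {p1, p3, p3, p5, ..., p13, p15, p15, *}
   ('*' marks the don't-care lane). The duplicated pixels line up with the
   duplicated coefficients built by prepare_horizontal_filter_coeff(), so each
   _mm_maddubs_epi16 below computes two taps of two output pixels at once. */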

static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
                                                   0, 1, 0, 1, 0, 1, 0, 1 };

static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
                                                   2, 3, 2, 3, 2, 3, 2, 3 };

static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
                                                   4, 5, 4, 5, 4, 5, 4, 5 };

static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
                                                   6, 7, 6, 7, 6, 7, 6, 7 };

static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
                                                  0, 1, 2, 3, 0, 1, 2, 3 };
static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
                                                  4, 5, 6, 7, 4, 5, 6, 7 };
static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
                                                  8, 9, 10, 11, 8, 9, 10, 11 };
static const uint8_t shuffle_gamma0_mask3[16] = {
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
};

static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}
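
/* Scalar sketch (a reading aid, not part of the original code) of what
   filter_src_pixels() computes for one row k and one output pixel p:

     int32_t sum = (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1);
     for (int t = 0; t < 8; ++t) sum += filter_p[t] * src_row[p + t];
     tmp_row[p] = (uint16_t)(sum >> reduce_bits_horiz);

   where filter_p[] is the 8-tap filter selected for pixel p. The offset term
   keeps 'sum' non-negative and within 16 unsigned bits, which is why the
   vector code above can add with wrapping 16-bit arithmetic and then use the
   logical (unsigned) shift _mm_srl_epi16. Note that the lanes of tmp[k + 7]
   hold the output pixels in the permuted order 0 2 4 6 1 3 5 7; the vertical
   pass below is written to consume exactly that order. */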

static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Load the 8-tap filter for each of the 8 output pixels; pixel p uses the
  // sub-pel offset (sx + p * alpha).
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // When alpha == 0, all 8 output pixels share the same filter, so a single
  // load plus broadcasting shuffles is enough.
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
}

static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}

static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

static INLINE void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16(w0);
  const __m128i wt1 = _mm_set1_epi16(w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}
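
/* Scalar sketch (a reading aid, not part of the original code) of how the
   constants prepared above are used per pixel in the compound-average path of
   store_vertical_filter_output():

     int32_t avg = use_jnt_comp_avg
                       ? (fwd_offset * dst16 + bck_offset * res16) >>
                             DIST_PRECISION_BITS
                       : (dst16 + res16) >> 1;
     avg += res_sub_const;  // remove the bit-depth offsets added earlier
     pred8 = clip_pixel((avg + round_bits_const) >> round_bits);

   where dst16 is the intermediate prediction already in conv_params->dst,
   res16 is the current warp result, and clip_pixel stands for the unsigned
   8-bit saturation performed by _mm_packus_epi16. */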

static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}
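
/* Scalar sketch (a reading aid, not part of the original code) of the
   vertical step for output row k and column p:

     int32_t sum = 0;
     for (int t = 0; t < 8; ++t) sum += filter_p[t] * tmp16[k + 4 + t][p];

   where filter_p[] is the 16-bit filter selected for column p (at offset
   sy + p * gamma). _mm_madd_epi16 accumulates in 32 bits, so no extra
   offset is needed at this stage; *res_lo ends up holding columns 0-3 and
   *res_hi columns 4-7. */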

static INLINE void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_jnt_comp_avg) {
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_jnt_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}

static INLINE void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

static INLINE void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}

void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/
  __m128i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                     ((1 << reduce_bits_vert) >> 1));
  }
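
  /* Worked example (a reading aid, not in the original source): in the common
     non-compound case with bd = 8 and round_0 = 3, we have
     reduce_bits_horiz = 3 and reduce_bits_vert = 2 * FILTER_BITS - 3 = 11, so
       res_add_const_1 = -(1 << 18) + (1 << 10).
     The (1 << 18) term cancels the horizontal-pass offset (1 << 11 per row,
     scaled by the vertical coefficient sum of 1 << 7), and (1 << 10) is the
     round-to-nearest bias for the final shift by 11. */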

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
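
      // Note (a reading aid, not in the original source): the
      // (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS) term biases sx4/sy4
      // by one full pixel, so that (sx >> WARPEDDIFF_PREC_BITS) indexes the
      // three segments [-1, 0), [0, 1) and [1, 2) of the filter tables
      // directly, with index 0 corresponding to an offset of -1 pixels.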

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
          }
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                            reduce_bits_horiz);
        }
      } else {
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                       beta, p_height, height, i,
                                       offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      prepare_warp_vertical_filter(
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
    }
  }
}