/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: Since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The rearrangement of coefficients in this table is so that we can get the
     coefficients into the correct order more quickly.
*/
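/* For example, the first row below, { 0, 127, 0, 0, 0, 1, 0, 0 }, corresponds
   to the 16-bit filter { 0, 0, 127, 1, 0, 0, 0, 0 } with its taps reordered
   into the column order 0, 2, 4, 6, 1, 3, 5, 7 described above. */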
/* clang-format off */
DECLARE_ALIGNED(8, static const int8_t,
                filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
  // [0, 1)
  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
  // [1, 2)
  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0,   2,  -1, 0,   0, 127, 0},

#else
  // [-1, 0)
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 1, 127,  -1, 0,  -3,   4, 0, 0},
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 124,  -4, 0,  -7,  13, 1, 0},
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 120,  -7, 0, -11,  22, 2, 0},
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 114, -10, 0, -14,  32, 3, 0},
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 108, -12, 0, -16,  42, 3, 0},
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 100, -14, 0, -17,  52, 3, 0},
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  91, -16, 0, -18,  63, 4, 0},
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  82, -17, 0, -18,  73, 4, 0},
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  73, -18, 0, -17,  82, 4, 0},
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  63, -18, 0, -16,  91, 4, 0},
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 3,  52, -17, 0, -14, 100, 4, 0},
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  42, -16, 0, -12, 108, 3, 0},
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 3,  32, -14, 0, -10, 114, 3, 0},
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  22, -11, 0,  -7, 120, 2, 0},
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  13,  -7, 0,  -4, 124, 1, 0},
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 0,   4,  -3, 0,  -1, 127, 1, 0},
  // [0, 1)
  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -3,   4, 1, 1, 127,  -2,  0},
  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -8,  13, 2, 3, 125,  -5, -1},
  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -13,  23, 3, 4, 121,  -8, -1},
  {-1, -15,  27, 4, 5, 119, -10, -1}, {-2, -17,  33, 5, 6, 116, -12, -1},
  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  43, 6, 7, 110, -15, -2},
  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  54, 7, 7, 102, -17, -2},
  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  64, 7, 8,  94, -19, -2},
  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -21,  74, 8, 8,  84, -21, -2},
  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  84, 8, 8,  74, -21, -2},
  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -19,  94, 8, 7,  64, -22, -2},
  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -17, 102, 7, 7,  54, -21, -2},
  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 110, 7, 6,  43, -19, -2},
  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 116, 6, 5,  33, -17, -2},
  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -8, 121, 4, 3,  23, -13, -1},
  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -5, 125, 3, 2,  13,  -8, -1},
  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   4,  -3,  0},
  // [1, 2)
  { 0,  0, 127,   0, 0,   1,   0, 0}, { 0, 1, 127,  -1, 0,  -3,   4, 0},
  { 0,  1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 124,  -4, 0,  -7,  13, 1},
  { 0,  2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 120,  -7, 0, -11,  22, 2},
  { 0,  3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 114, -10, 0, -14,  32, 3},
  { 0,  3, 111, -11, 0, -15,  37, 3}, { 0, 3, 108, -12, 0, -16,  42, 3},
  { 0,  4, 104, -13, 0, -17,  47, 3}, { 0, 4, 100, -14, 0, -17,  52, 3},
  { 0,  4,  96, -15, 0, -18,  58, 3}, { 0, 4,  91, -16, 0, -18,  63, 4},
  { 0,  4,  87, -17, 0, -18,  68, 4}, { 0, 4,  82, -17, 0, -18,  73, 4},
  { 0,  4,  78, -18, 0, -18,  78, 4}, { 0, 4,  73, -18, 0, -17,  82, 4},
  { 0,  4,  68, -18, 0, -17,  87, 4}, { 0, 4,  63, -18, 0, -16,  91, 4},
  { 0,  3,  58, -18, 0, -15,  96, 4}, { 0, 3,  52, -17, 0, -14, 100, 4},
  { 0,  3,  47, -17, 0, -13, 104, 4}, { 0, 3,  42, -16, 0, -12, 108, 3},
  { 0,  3,  37, -15, 0, -11, 111, 3}, { 0, 3,  32, -14, 0, -10, 114, 3},
  { 0,  2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  22, -11, 0,  -7, 120, 2},
  { 0,  1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  13,  -7, 0,  -4, 124, 1},
  { 0,  1,   8,  -5, 0,  -3, 126, 1}, { 0, 0,   4,  -3, 0,  -1, 127, 1},
  // dummy (replicate row index 95)
  { 0, 0,   4,  -3, 0,  -1, 127, 1},
#endif  // WARPEDPIXEL_PREC_BITS == 6
};
/* clang-format on */

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
static const uint8_t even_mask[16] = { 0, 2,  2,  4,  4,  6,  6,  8,
                                       8, 10, 10, 12, 12, 14, 14, 0 };
static const uint8_t odd_mask[16] = { 1, 3,  3,  5,  5,  7,  7,  9,
                                      9, 11, 11, 13, 13, 15, 15, 0 };

static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
                                                   0, 1, 0, 1, 0, 1, 0, 1 };

static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
                                                   2, 3, 2, 3, 2, 3, 2, 3 };

static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
                                                   4, 5, 4, 5, 4, 5, 4, 5 };

static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
                                                   6, 7, 6, 7, 6, 7, 6, 7 };

static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
                                                  0, 1, 2, 3, 0, 1, 2, 3 };
static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
                                                  4, 5, 6, 7, 4, 5, 6, 7 };
static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
                                                  8, 9, 10, 11, 8, 9, 10, 11 };
static const uint8_t shuffle_gamma0_mask3[16] = {
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
};

static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
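  // Each _mm_maddubs_epi16 below multiplies unsigned source bytes by signed
  // 8-bit coefficients and adds adjacent products, so each call yields one
  // pair of partial sums (taps 0+2, 4+6, 1+3 or 5+7) for all 8 output pixels.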
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

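  // round_const combines the horizontal offset (1 << offset_bits_horiz) with
  // the rounding term for the shift by reduce_bits_horiz below.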
  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}

static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // With alpha == 0, every output pixel uses the same filter, so load it once
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
}

static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}

static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

static INLINE void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
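  // Constants and interleaved distance weights used when averaging the two
  // compound predictions back down to the final 8-bit output.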
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16(w0);
  const __m128i wt1 = _mm_set1_epi16(w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}

static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
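  // With gamma == 0 every output column uses the same vertical filter, so a
  // single 8-tap (16-bit) filter is loaded and broadcast to all positions.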
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}

static INLINE void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

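  // Compound path: write 16-bit intermediate results to conv_params->dst.
  // When do_average is set, combine them with the values already there
  // (distance-weighted if use_jnt_comp_avg) and write the final 8-bit pixels
  // to 'pred' instead.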
  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_jnt_comp_avg) {
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_jnt_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}

static INLINE void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
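  // Dispatch to the cheapest specialisation: gamma == 0 means every column
  // shares one filter, and delta == 0 means the coefficients can be prepared
  // once outside the row loop.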
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

static INLINE void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
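  // Dispatch to the cheapest specialisation: alpha == 0 means every column
  // shares one filter, and beta == 0 means the coefficients can be prepared
  // once outside the row loop.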
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}

void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) {
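  // tmp holds 15 rows of horizontally filtered samples: 8 output rows plus
  // the 7 extra rows needed by the 8-tap vertical filter.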
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  /* Note: For this code to work, the left/right frame borders need to be
  extended by at least 13 pixels each. By the time we get here, other
  code will have set up this border, but we allow an explicit check
  for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
  for (j = 0; j < 13; ++j) {
  assert(ref[i * stride - 13 + j] == ref[i * stride]);
  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
  }
  }*/
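  // res_add_const_1 is the constant added before the final vertical downshift:
  // it supplies the rounding term and, on the non-compound path, also cancels
  // the offset that was added during filtering.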
  __m128i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                     ((1 << reduce_bits_vert) >> 1));
  }

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
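      // Project the centre of the current 8x8 output block through the
      // affine model 'mat' to find the corresponding position in the
      // reference frame, in WARPEDMODEL_PREC_BITS precision.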
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
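        // The block straddles the left and/or right frame edge: load each row
        // and replicate the boundary pixel into the out-of-range lanes via the
        // warp_pad_left/warp_pad_right shuffle tables before filtering.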
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
          }
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                            reduce_bits_horiz);
        }
      } else {
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                       beta, p_height, height, i,
                                       offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      prepare_warp_vertical_filter(
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
    }
  }
}