Lines Matching refs:buf0

8   __m128i buf0[32];  in av1_fdct32_new_sse4_1()  local
19 buf0[j] = input[j * col_num + col]; in av1_fdct32_new_sse4_1()
24 buf1[0] = _mm_add_epi32(buf0[0], buf0[31]); in av1_fdct32_new_sse4_1()
25 buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]); in av1_fdct32_new_sse4_1()
26 buf1[1] = _mm_add_epi32(buf0[1], buf0[30]); in av1_fdct32_new_sse4_1()
27 buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]); in av1_fdct32_new_sse4_1()
28 buf1[2] = _mm_add_epi32(buf0[2], buf0[29]); in av1_fdct32_new_sse4_1()
29 buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]); in av1_fdct32_new_sse4_1()
30 buf1[3] = _mm_add_epi32(buf0[3], buf0[28]); in av1_fdct32_new_sse4_1()
31 buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]); in av1_fdct32_new_sse4_1()
32 buf1[4] = _mm_add_epi32(buf0[4], buf0[27]); in av1_fdct32_new_sse4_1()
33 buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]); in av1_fdct32_new_sse4_1()
34 buf1[5] = _mm_add_epi32(buf0[5], buf0[26]); in av1_fdct32_new_sse4_1()
35 buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]); in av1_fdct32_new_sse4_1()
36 buf1[6] = _mm_add_epi32(buf0[6], buf0[25]); in av1_fdct32_new_sse4_1()
37 buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]); in av1_fdct32_new_sse4_1()
38 buf1[7] = _mm_add_epi32(buf0[7], buf0[24]); in av1_fdct32_new_sse4_1()
39 buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]); in av1_fdct32_new_sse4_1()
40 buf1[8] = _mm_add_epi32(buf0[8], buf0[23]); in av1_fdct32_new_sse4_1()
41 buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]); in av1_fdct32_new_sse4_1()
42 buf1[9] = _mm_add_epi32(buf0[9], buf0[22]); in av1_fdct32_new_sse4_1()
43 buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]); in av1_fdct32_new_sse4_1()
44 buf1[10] = _mm_add_epi32(buf0[10], buf0[21]); in av1_fdct32_new_sse4_1()
45 buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]); in av1_fdct32_new_sse4_1()
46 buf1[11] = _mm_add_epi32(buf0[11], buf0[20]); in av1_fdct32_new_sse4_1()
47 buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]); in av1_fdct32_new_sse4_1()
48 buf1[12] = _mm_add_epi32(buf0[12], buf0[19]); in av1_fdct32_new_sse4_1()
49 buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]); in av1_fdct32_new_sse4_1()
50 buf1[13] = _mm_add_epi32(buf0[13], buf0[18]); in av1_fdct32_new_sse4_1()
51 buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]); in av1_fdct32_new_sse4_1()
52 buf1[14] = _mm_add_epi32(buf0[14], buf0[17]); in av1_fdct32_new_sse4_1()
53 buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]); in av1_fdct32_new_sse4_1()
54 buf1[15] = _mm_add_epi32(buf0[15], buf0[16]); in av1_fdct32_new_sse4_1()
55 buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]); in av1_fdct32_new_sse4_1()
61 buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); in av1_fdct32_new_sse4_1()
62 buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); in av1_fdct32_new_sse4_1()
63 buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); in av1_fdct32_new_sse4_1()
64 buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); in av1_fdct32_new_sse4_1()
65 buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); in av1_fdct32_new_sse4_1()
66 buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); in av1_fdct32_new_sse4_1()
67 buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); in av1_fdct32_new_sse4_1()
68 buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); in av1_fdct32_new_sse4_1()
69 buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); in av1_fdct32_new_sse4_1()
70 buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); in av1_fdct32_new_sse4_1()
71 buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); in av1_fdct32_new_sse4_1()
72 buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); in av1_fdct32_new_sse4_1()
73 buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); in av1_fdct32_new_sse4_1()
74 buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); in av1_fdct32_new_sse4_1()
75 buf0[7] = _mm_add_epi32(buf1[7], buf1[8]); in av1_fdct32_new_sse4_1()
76 buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); in av1_fdct32_new_sse4_1()
77 buf0[16] = buf1[16]; in av1_fdct32_new_sse4_1()
78 buf0[17] = buf1[17]; in av1_fdct32_new_sse4_1()
79 buf0[18] = buf1[18]; in av1_fdct32_new_sse4_1()
80 buf0[19] = buf1[19]; in av1_fdct32_new_sse4_1()
81 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], in av1_fdct32_new_sse4_1()
82 buf0[27], bit); in av1_fdct32_new_sse4_1()
83 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], in av1_fdct32_new_sse4_1()
84 buf0[26], bit); in av1_fdct32_new_sse4_1()
85 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], in av1_fdct32_new_sse4_1()
86 buf0[25], bit); in av1_fdct32_new_sse4_1()
87 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], in av1_fdct32_new_sse4_1()
88 buf0[24], bit); in av1_fdct32_new_sse4_1()
89 buf0[28] = buf1[28]; in av1_fdct32_new_sse4_1()
90 buf0[29] = buf1[29]; in av1_fdct32_new_sse4_1()
91 buf0[30] = buf1[30]; in av1_fdct32_new_sse4_1()
92 buf0[31] = buf1[31]; in av1_fdct32_new_sse4_1()
98 buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); in av1_fdct32_new_sse4_1()
99 buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); in av1_fdct32_new_sse4_1()
100 buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); in av1_fdct32_new_sse4_1()
101 buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); in av1_fdct32_new_sse4_1()
102 buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); in av1_fdct32_new_sse4_1()
103 buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); in av1_fdct32_new_sse4_1()
104 buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); in av1_fdct32_new_sse4_1()
105 buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); in av1_fdct32_new_sse4_1()
106 buf1[8] = buf0[8]; in av1_fdct32_new_sse4_1()
107 buf1[9] = buf0[9]; in av1_fdct32_new_sse4_1()
108 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], in av1_fdct32_new_sse4_1()
110 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], in av1_fdct32_new_sse4_1()
112 buf1[14] = buf0[14]; in av1_fdct32_new_sse4_1()
113 buf1[15] = buf0[15]; in av1_fdct32_new_sse4_1()
114 buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); in av1_fdct32_new_sse4_1()
115 buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); in av1_fdct32_new_sse4_1()
116 buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); in av1_fdct32_new_sse4_1()
117 buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); in av1_fdct32_new_sse4_1()
118 buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); in av1_fdct32_new_sse4_1()
119 buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); in av1_fdct32_new_sse4_1()
120 buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); in av1_fdct32_new_sse4_1()
121 buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); in av1_fdct32_new_sse4_1()
122 buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); in av1_fdct32_new_sse4_1()
123 buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); in av1_fdct32_new_sse4_1()
124 buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); in av1_fdct32_new_sse4_1()
125 buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); in av1_fdct32_new_sse4_1()
126 buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); in av1_fdct32_new_sse4_1()
127 buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); in av1_fdct32_new_sse4_1()
128 buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); in av1_fdct32_new_sse4_1()
129 buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); in av1_fdct32_new_sse4_1()
135 buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); in av1_fdct32_new_sse4_1()
136 buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); in av1_fdct32_new_sse4_1()
137 buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); in av1_fdct32_new_sse4_1()
138 buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); in av1_fdct32_new_sse4_1()
139 buf0[4] = buf1[4]; in av1_fdct32_new_sse4_1()
140 btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], in av1_fdct32_new_sse4_1()
141 buf0[6], bit); in av1_fdct32_new_sse4_1()
142 buf0[7] = buf1[7]; in av1_fdct32_new_sse4_1()
143 buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); in av1_fdct32_new_sse4_1()
144 buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); in av1_fdct32_new_sse4_1()
145 buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); in av1_fdct32_new_sse4_1()
146 buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); in av1_fdct32_new_sse4_1()
147 buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); in av1_fdct32_new_sse4_1()
148 buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); in av1_fdct32_new_sse4_1()
149 buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); in av1_fdct32_new_sse4_1()
150 buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); in av1_fdct32_new_sse4_1()
151 buf0[16] = buf1[16]; in av1_fdct32_new_sse4_1()
152 buf0[17] = buf1[17]; in av1_fdct32_new_sse4_1()
153 btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], in av1_fdct32_new_sse4_1()
154 buf0[29], bit); in av1_fdct32_new_sse4_1()
155 btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], in av1_fdct32_new_sse4_1()
156 buf0[28], bit); in av1_fdct32_new_sse4_1()
157 btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], in av1_fdct32_new_sse4_1()
158 buf0[27], bit); in av1_fdct32_new_sse4_1()
159 btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], in av1_fdct32_new_sse4_1()
160 buf0[26], bit); in av1_fdct32_new_sse4_1()
161 buf0[22] = buf1[22]; in av1_fdct32_new_sse4_1()
162 buf0[23] = buf1[23]; in av1_fdct32_new_sse4_1()
163 buf0[24] = buf1[24]; in av1_fdct32_new_sse4_1()
164 buf0[25] = buf1[25]; in av1_fdct32_new_sse4_1()
165 buf0[30] = buf1[30]; in av1_fdct32_new_sse4_1()
166 buf0[31] = buf1[31]; in av1_fdct32_new_sse4_1()
172 btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], in av1_fdct32_new_sse4_1()
174 btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], in av1_fdct32_new_sse4_1()
176 buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); in av1_fdct32_new_sse4_1()
177 buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); in av1_fdct32_new_sse4_1()
178 buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); in av1_fdct32_new_sse4_1()
179 buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); in av1_fdct32_new_sse4_1()
180 buf1[8] = buf0[8]; in av1_fdct32_new_sse4_1()
181 btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], in av1_fdct32_new_sse4_1()
183 btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], in av1_fdct32_new_sse4_1()
185 buf1[11] = buf0[11]; in av1_fdct32_new_sse4_1()
186 buf1[12] = buf0[12]; in av1_fdct32_new_sse4_1()
187 buf1[15] = buf0[15]; in av1_fdct32_new_sse4_1()
188 buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); in av1_fdct32_new_sse4_1()
189 buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); in av1_fdct32_new_sse4_1()
190 buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); in av1_fdct32_new_sse4_1()
191 buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); in av1_fdct32_new_sse4_1()
192 buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); in av1_fdct32_new_sse4_1()
193 buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); in av1_fdct32_new_sse4_1()
194 buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); in av1_fdct32_new_sse4_1()
195 buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); in av1_fdct32_new_sse4_1()
196 buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); in av1_fdct32_new_sse4_1()
197 buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); in av1_fdct32_new_sse4_1()
198 buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); in av1_fdct32_new_sse4_1()
199 buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); in av1_fdct32_new_sse4_1()
200 buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); in av1_fdct32_new_sse4_1()
201 buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); in av1_fdct32_new_sse4_1()
202 buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); in av1_fdct32_new_sse4_1()
203 buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); in av1_fdct32_new_sse4_1()
209 buf0[0] = buf1[0]; in av1_fdct32_new_sse4_1()
210 buf0[1] = buf1[1]; in av1_fdct32_new_sse4_1()
211 buf0[2] = buf1[2]; in av1_fdct32_new_sse4_1()
212 buf0[3] = buf1[3]; in av1_fdct32_new_sse4_1()
213 btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], in av1_fdct32_new_sse4_1()
215 btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], in av1_fdct32_new_sse4_1()
216 buf0[6], bit); in av1_fdct32_new_sse4_1()
217 buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); in av1_fdct32_new_sse4_1()
218 buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); in av1_fdct32_new_sse4_1()
219 buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); in av1_fdct32_new_sse4_1()
220 buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); in av1_fdct32_new_sse4_1()
221 buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); in av1_fdct32_new_sse4_1()
222 buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); in av1_fdct32_new_sse4_1()
223 buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); in av1_fdct32_new_sse4_1()
224 buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); in av1_fdct32_new_sse4_1()
225 buf0[16] = buf1[16]; in av1_fdct32_new_sse4_1()
226 btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], in av1_fdct32_new_sse4_1()
227 buf0[30], bit); in av1_fdct32_new_sse4_1()
228 btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], in av1_fdct32_new_sse4_1()
229 buf0[29], bit); in av1_fdct32_new_sse4_1()
230 buf0[19] = buf1[19]; in av1_fdct32_new_sse4_1()
231 buf0[20] = buf1[20]; in av1_fdct32_new_sse4_1()
232 btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], in av1_fdct32_new_sse4_1()
233 buf0[26], bit); in av1_fdct32_new_sse4_1()
234 btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], in av1_fdct32_new_sse4_1()
235 buf0[25], bit); in av1_fdct32_new_sse4_1()
236 buf0[23] = buf1[23]; in av1_fdct32_new_sse4_1()
237 buf0[24] = buf1[24]; in av1_fdct32_new_sse4_1()
238 buf0[27] = buf1[27]; in av1_fdct32_new_sse4_1()
239 buf0[28] = buf1[28]; in av1_fdct32_new_sse4_1()
240 buf0[31] = buf1[31]; in av1_fdct32_new_sse4_1()
246 buf1[0] = buf0[0]; in av1_fdct32_new_sse4_1()
247 buf1[1] = buf0[1]; in av1_fdct32_new_sse4_1()
248 buf1[2] = buf0[2]; in av1_fdct32_new_sse4_1()
249 buf1[3] = buf0[3]; in av1_fdct32_new_sse4_1()
250 buf1[4] = buf0[4]; in av1_fdct32_new_sse4_1()
251 buf1[5] = buf0[5]; in av1_fdct32_new_sse4_1()
252 buf1[6] = buf0[6]; in av1_fdct32_new_sse4_1()
253 buf1[7] = buf0[7]; in av1_fdct32_new_sse4_1()
254 btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], in av1_fdct32_new_sse4_1()
256 btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], in av1_fdct32_new_sse4_1()
258 btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], in av1_fdct32_new_sse4_1()
260 btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], in av1_fdct32_new_sse4_1()
262 buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); in av1_fdct32_new_sse4_1()
263 buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); in av1_fdct32_new_sse4_1()
264 buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); in av1_fdct32_new_sse4_1()
265 buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); in av1_fdct32_new_sse4_1()
266 buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); in av1_fdct32_new_sse4_1()
267 buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); in av1_fdct32_new_sse4_1()
268 buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); in av1_fdct32_new_sse4_1()
269 buf1[23] = _mm_add_epi32(buf0[23], buf0[22]); in av1_fdct32_new_sse4_1()
270 buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); in av1_fdct32_new_sse4_1()
271 buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); in av1_fdct32_new_sse4_1()
272 buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); in av1_fdct32_new_sse4_1()
273 buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); in av1_fdct32_new_sse4_1()
274 buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); in av1_fdct32_new_sse4_1()
275 buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); in av1_fdct32_new_sse4_1()
276 buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); in av1_fdct32_new_sse4_1()
277 buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); in av1_fdct32_new_sse4_1()
283 buf0[0] = buf1[0]; in av1_fdct32_new_sse4_1()
284 buf0[1] = buf1[1]; in av1_fdct32_new_sse4_1()
285 buf0[2] = buf1[2]; in av1_fdct32_new_sse4_1()
286 buf0[3] = buf1[3]; in av1_fdct32_new_sse4_1()
287 buf0[4] = buf1[4]; in av1_fdct32_new_sse4_1()
288 buf0[5] = buf1[5]; in av1_fdct32_new_sse4_1()
289 buf0[6] = buf1[6]; in av1_fdct32_new_sse4_1()
290 buf0[7] = buf1[7]; in av1_fdct32_new_sse4_1()
291 buf0[8] = buf1[8]; in av1_fdct32_new_sse4_1()
292 buf0[9] = buf1[9]; in av1_fdct32_new_sse4_1()
293 buf0[10] = buf1[10]; in av1_fdct32_new_sse4_1()
294 buf0[11] = buf1[11]; in av1_fdct32_new_sse4_1()
295 buf0[12] = buf1[12]; in av1_fdct32_new_sse4_1()
296 buf0[13] = buf1[13]; in av1_fdct32_new_sse4_1()
297 buf0[14] = buf1[14]; in av1_fdct32_new_sse4_1()
298 buf0[15] = buf1[15]; in av1_fdct32_new_sse4_1()
299 btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], in av1_fdct32_new_sse4_1()
300 buf0[31], bit); in av1_fdct32_new_sse4_1()
301 btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], in av1_fdct32_new_sse4_1()
302 buf0[30], bit); in av1_fdct32_new_sse4_1()
303 btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], in av1_fdct32_new_sse4_1()
304 buf0[29], bit); in av1_fdct32_new_sse4_1()
305 btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], in av1_fdct32_new_sse4_1()
306 buf0[28], bit); in av1_fdct32_new_sse4_1()
307 btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], in av1_fdct32_new_sse4_1()
308 buf0[27], bit); in av1_fdct32_new_sse4_1()
309 btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], in av1_fdct32_new_sse4_1()
310 buf0[26], bit); in av1_fdct32_new_sse4_1()
311 btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], in av1_fdct32_new_sse4_1()
312 buf0[25], bit); in av1_fdct32_new_sse4_1()
313 btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], in av1_fdct32_new_sse4_1()
314 buf0[24], bit); in av1_fdct32_new_sse4_1()
318 buf1[0] = buf0[0]; in av1_fdct32_new_sse4_1()
319 buf1[1] = buf0[16]; in av1_fdct32_new_sse4_1()
320 buf1[2] = buf0[8]; in av1_fdct32_new_sse4_1()
321 buf1[3] = buf0[24]; in av1_fdct32_new_sse4_1()
322 buf1[4] = buf0[4]; in av1_fdct32_new_sse4_1()
323 buf1[5] = buf0[20]; in av1_fdct32_new_sse4_1()
324 buf1[6] = buf0[12]; in av1_fdct32_new_sse4_1()
325 buf1[7] = buf0[28]; in av1_fdct32_new_sse4_1()
326 buf1[8] = buf0[2]; in av1_fdct32_new_sse4_1()
327 buf1[9] = buf0[18]; in av1_fdct32_new_sse4_1()
328 buf1[10] = buf0[10]; in av1_fdct32_new_sse4_1()
329 buf1[11] = buf0[26]; in av1_fdct32_new_sse4_1()
330 buf1[12] = buf0[6]; in av1_fdct32_new_sse4_1()
331 buf1[13] = buf0[22]; in av1_fdct32_new_sse4_1()
332 buf1[14] = buf0[14]; in av1_fdct32_new_sse4_1()
333 buf1[15] = buf0[30]; in av1_fdct32_new_sse4_1()
334 buf1[16] = buf0[1]; in av1_fdct32_new_sse4_1()
335 buf1[17] = buf0[17]; in av1_fdct32_new_sse4_1()
336 buf1[18] = buf0[9]; in av1_fdct32_new_sse4_1()
337 buf1[19] = buf0[25]; in av1_fdct32_new_sse4_1()
338 buf1[20] = buf0[5]; in av1_fdct32_new_sse4_1()
339 buf1[21] = buf0[21]; in av1_fdct32_new_sse4_1()
340 buf1[22] = buf0[13]; in av1_fdct32_new_sse4_1()
341 buf1[23] = buf0[29]; in av1_fdct32_new_sse4_1()
342 buf1[24] = buf0[3]; in av1_fdct32_new_sse4_1()
343 buf1[25] = buf0[19]; in av1_fdct32_new_sse4_1()
344 buf1[26] = buf0[11]; in av1_fdct32_new_sse4_1()
345 buf1[27] = buf0[27]; in av1_fdct32_new_sse4_1()
346 buf1[28] = buf0[7]; in av1_fdct32_new_sse4_1()
347 buf1[29] = buf0[23]; in av1_fdct32_new_sse4_1()
348 buf1[30] = buf0[15]; in av1_fdct32_new_sse4_1()
349 buf1[31] = buf0[31]; in av1_fdct32_new_sse4_1()
362 __m128i buf0[4]; in av1_fadst4_new_sse4_1() local
373 buf0[j] = input[j * col_num + col]; in av1_fadst4_new_sse4_1()
378 buf1[0] = buf0[3]; in av1_fadst4_new_sse4_1()
379 buf1[1] = buf0[0]; in av1_fadst4_new_sse4_1()
380 buf1[2] = buf0[1]; in av1_fadst4_new_sse4_1()
381 buf1[3] = buf0[2]; in av1_fadst4_new_sse4_1()
387 btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], in av1_fadst4_new_sse4_1()
389 btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], in av1_fadst4_new_sse4_1()
390 buf0[3], bit); in av1_fadst4_new_sse4_1()
394 buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); in av1_fadst4_new_sse4_1()
395 buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); in av1_fadst4_new_sse4_1()
396 buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); in av1_fadst4_new_sse4_1()
397 buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); in av1_fadst4_new_sse4_1()
403 buf0[0] = buf1[0]; in av1_fadst4_new_sse4_1()
404 buf0[1] = buf1[1]; in av1_fadst4_new_sse4_1()
405 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], in av1_fadst4_new_sse4_1()
406 buf0[3], bit); in av1_fadst4_new_sse4_1()
410 buf1[0] = buf0[0]; in av1_fadst4_new_sse4_1()
411 buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); in av1_fadst4_new_sse4_1()
412 buf1[2] = buf0[3]; in av1_fadst4_new_sse4_1()
413 buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); in av1_fadst4_new_sse4_1()
426 __m128i buf0[32]; in av1_fadst32_new_sse4_1() local
437 buf0[j] = input[j * col_num + col]; in av1_fadst32_new_sse4_1()
442 buf1[0] = buf0[31]; in av1_fadst32_new_sse4_1()
443 buf1[1] = buf0[0]; in av1_fadst32_new_sse4_1()
444 buf1[2] = buf0[29]; in av1_fadst32_new_sse4_1()
445 buf1[3] = buf0[2]; in av1_fadst32_new_sse4_1()
446 buf1[4] = buf0[27]; in av1_fadst32_new_sse4_1()
447 buf1[5] = buf0[4]; in av1_fadst32_new_sse4_1()
448 buf1[6] = buf0[25]; in av1_fadst32_new_sse4_1()
449 buf1[7] = buf0[6]; in av1_fadst32_new_sse4_1()
450 buf1[8] = buf0[23]; in av1_fadst32_new_sse4_1()
451 buf1[9] = buf0[8]; in av1_fadst32_new_sse4_1()
452 buf1[10] = buf0[21]; in av1_fadst32_new_sse4_1()
453 buf1[11] = buf0[10]; in av1_fadst32_new_sse4_1()
454 buf1[12] = buf0[19]; in av1_fadst32_new_sse4_1()
455 buf1[13] = buf0[12]; in av1_fadst32_new_sse4_1()
456 buf1[14] = buf0[17]; in av1_fadst32_new_sse4_1()
457 buf1[15] = buf0[14]; in av1_fadst32_new_sse4_1()
458 buf1[16] = buf0[15]; in av1_fadst32_new_sse4_1()
459 buf1[17] = buf0[16]; in av1_fadst32_new_sse4_1()
460 buf1[18] = buf0[13]; in av1_fadst32_new_sse4_1()
461 buf1[19] = buf0[18]; in av1_fadst32_new_sse4_1()
462 buf1[20] = buf0[11]; in av1_fadst32_new_sse4_1()
463 buf1[21] = buf0[20]; in av1_fadst32_new_sse4_1()
464 buf1[22] = buf0[9]; in av1_fadst32_new_sse4_1()
465 buf1[23] = buf0[22]; in av1_fadst32_new_sse4_1()
466 buf1[24] = buf0[7]; in av1_fadst32_new_sse4_1()
467 buf1[25] = buf0[24]; in av1_fadst32_new_sse4_1()
468 buf1[26] = buf0[5]; in av1_fadst32_new_sse4_1()
469 buf1[27] = buf0[26]; in av1_fadst32_new_sse4_1()
470 buf1[28] = buf0[3]; in av1_fadst32_new_sse4_1()
471 buf1[29] = buf0[28]; in av1_fadst32_new_sse4_1()
472 buf1[30] = buf0[1]; in av1_fadst32_new_sse4_1()
473 buf1[31] = buf0[30]; in av1_fadst32_new_sse4_1()
479 btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1], in av1_fadst32_new_sse4_1()
481 btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3], in av1_fadst32_new_sse4_1()
483 btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5], in av1_fadst32_new_sse4_1()
485 btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6], in av1_fadst32_new_sse4_1()
486 buf0[7], bit); in av1_fadst32_new_sse4_1()
487 btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8], in av1_fadst32_new_sse4_1()
488 buf0[9], bit); in av1_fadst32_new_sse4_1()
489 btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10], in av1_fadst32_new_sse4_1()
490 buf0[11], bit); in av1_fadst32_new_sse4_1()
491 btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12], in av1_fadst32_new_sse4_1()
492 buf0[13], bit); in av1_fadst32_new_sse4_1()
493 btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14], in av1_fadst32_new_sse4_1()
494 buf0[15], bit); in av1_fadst32_new_sse4_1()
495 btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16], in av1_fadst32_new_sse4_1()
496 buf0[17], bit); in av1_fadst32_new_sse4_1()
497 btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18], in av1_fadst32_new_sse4_1()
498 buf0[19], bit); in av1_fadst32_new_sse4_1()
499 btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20], in av1_fadst32_new_sse4_1()
500 buf0[21], bit); in av1_fadst32_new_sse4_1()
501 btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22], in av1_fadst32_new_sse4_1()
502 buf0[23], bit); in av1_fadst32_new_sse4_1()
503 btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24], in av1_fadst32_new_sse4_1()
504 buf0[25], bit); in av1_fadst32_new_sse4_1()
505 btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26], in av1_fadst32_new_sse4_1()
506 buf0[27], bit); in av1_fadst32_new_sse4_1()
507 btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28], in av1_fadst32_new_sse4_1()
508 buf0[29], bit); in av1_fadst32_new_sse4_1()
509 btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30], in av1_fadst32_new_sse4_1()
510 buf0[31], bit); in av1_fadst32_new_sse4_1()
514 buf1[0] = _mm_add_epi32(buf0[0], buf0[16]); in av1_fadst32_new_sse4_1()
515 buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]); in av1_fadst32_new_sse4_1()
516 buf1[1] = _mm_add_epi32(buf0[1], buf0[17]); in av1_fadst32_new_sse4_1()
517 buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]); in av1_fadst32_new_sse4_1()
518 buf1[2] = _mm_add_epi32(buf0[2], buf0[18]); in av1_fadst32_new_sse4_1()
519 buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]); in av1_fadst32_new_sse4_1()
520 buf1[3] = _mm_add_epi32(buf0[3], buf0[19]); in av1_fadst32_new_sse4_1()
521 buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]); in av1_fadst32_new_sse4_1()
522 buf1[4] = _mm_add_epi32(buf0[4], buf0[20]); in av1_fadst32_new_sse4_1()
523 buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]); in av1_fadst32_new_sse4_1()
524 buf1[5] = _mm_add_epi32(buf0[5], buf0[21]); in av1_fadst32_new_sse4_1()
525 buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]); in av1_fadst32_new_sse4_1()
526 buf1[6] = _mm_add_epi32(buf0[6], buf0[22]); in av1_fadst32_new_sse4_1()
527 buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]); in av1_fadst32_new_sse4_1()
528 buf1[7] = _mm_add_epi32(buf0[7], buf0[23]); in av1_fadst32_new_sse4_1()
529 buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]); in av1_fadst32_new_sse4_1()
530 buf1[8] = _mm_add_epi32(buf0[8], buf0[24]); in av1_fadst32_new_sse4_1()
531 buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]); in av1_fadst32_new_sse4_1()
532 buf1[9] = _mm_add_epi32(buf0[9], buf0[25]); in av1_fadst32_new_sse4_1()
533 buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]); in av1_fadst32_new_sse4_1()
534 buf1[10] = _mm_add_epi32(buf0[10], buf0[26]); in av1_fadst32_new_sse4_1()
535 buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]); in av1_fadst32_new_sse4_1()
536 buf1[11] = _mm_add_epi32(buf0[11], buf0[27]); in av1_fadst32_new_sse4_1()
537 buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]); in av1_fadst32_new_sse4_1()
538 buf1[12] = _mm_add_epi32(buf0[12], buf0[28]); in av1_fadst32_new_sse4_1()
539 buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]); in av1_fadst32_new_sse4_1()
540 buf1[13] = _mm_add_epi32(buf0[13], buf0[29]); in av1_fadst32_new_sse4_1()
541 buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]); in av1_fadst32_new_sse4_1()
542 buf1[14] = _mm_add_epi32(buf0[14], buf0[30]); in av1_fadst32_new_sse4_1()
543 buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]); in av1_fadst32_new_sse4_1()
544 buf1[15] = _mm_add_epi32(buf0[15], buf0[31]); in av1_fadst32_new_sse4_1()
545 buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]); in av1_fadst32_new_sse4_1()
551 buf0[0] = buf1[0]; in av1_fadst32_new_sse4_1()
552 buf0[1] = buf1[1]; in av1_fadst32_new_sse4_1()
553 buf0[2] = buf1[2]; in av1_fadst32_new_sse4_1()
554 buf0[3] = buf1[3]; in av1_fadst32_new_sse4_1()
555 buf0[4] = buf1[4]; in av1_fadst32_new_sse4_1()
556 buf0[5] = buf1[5]; in av1_fadst32_new_sse4_1()
557 buf0[6] = buf1[6]; in av1_fadst32_new_sse4_1()
558 buf0[7] = buf1[7]; in av1_fadst32_new_sse4_1()
559 buf0[8] = buf1[8]; in av1_fadst32_new_sse4_1()
560 buf0[9] = buf1[9]; in av1_fadst32_new_sse4_1()
561 buf0[10] = buf1[10]; in av1_fadst32_new_sse4_1()
562 buf0[11] = buf1[11]; in av1_fadst32_new_sse4_1()
563 buf0[12] = buf1[12]; in av1_fadst32_new_sse4_1()
564 buf0[13] = buf1[13]; in av1_fadst32_new_sse4_1()
565 buf0[14] = buf1[14]; in av1_fadst32_new_sse4_1()
566 buf0[15] = buf1[15]; in av1_fadst32_new_sse4_1()
567 btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16], in av1_fadst32_new_sse4_1()
568 buf0[17], bit); in av1_fadst32_new_sse4_1()
569 btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18], in av1_fadst32_new_sse4_1()
570 buf0[19], bit); in av1_fadst32_new_sse4_1()
571 btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20], in av1_fadst32_new_sse4_1()
572 buf0[21], bit); in av1_fadst32_new_sse4_1()
573 btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22], in av1_fadst32_new_sse4_1()
574 buf0[23], bit); in av1_fadst32_new_sse4_1()
575 btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24], in av1_fadst32_new_sse4_1()
576 buf0[25], bit); in av1_fadst32_new_sse4_1()
577 btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26], in av1_fadst32_new_sse4_1()
578 buf0[27], bit); in av1_fadst32_new_sse4_1()
579 btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28], in av1_fadst32_new_sse4_1()
580 buf0[29], bit); in av1_fadst32_new_sse4_1()
581 btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30], in av1_fadst32_new_sse4_1()
582 buf0[31], bit); in av1_fadst32_new_sse4_1()
586 buf1[0] = _mm_add_epi32(buf0[0], buf0[8]); in av1_fadst32_new_sse4_1()
587 buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]); in av1_fadst32_new_sse4_1()
588 buf1[1] = _mm_add_epi32(buf0[1], buf0[9]); in av1_fadst32_new_sse4_1()
589 buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]); in av1_fadst32_new_sse4_1()
590 buf1[2] = _mm_add_epi32(buf0[2], buf0[10]); in av1_fadst32_new_sse4_1()
591 buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]); in av1_fadst32_new_sse4_1()
592 buf1[3] = _mm_add_epi32(buf0[3], buf0[11]); in av1_fadst32_new_sse4_1()
593 buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]); in av1_fadst32_new_sse4_1()
594 buf1[4] = _mm_add_epi32(buf0[4], buf0[12]); in av1_fadst32_new_sse4_1()
595 buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]); in av1_fadst32_new_sse4_1()
596 buf1[5] = _mm_add_epi32(buf0[5], buf0[13]); in av1_fadst32_new_sse4_1()
597 buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]); in av1_fadst32_new_sse4_1()
598 buf1[6] = _mm_add_epi32(buf0[6], buf0[14]); in av1_fadst32_new_sse4_1()
599 buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]); in av1_fadst32_new_sse4_1()
600 buf1[7] = _mm_add_epi32(buf0[7], buf0[15]); in av1_fadst32_new_sse4_1()
601 buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]); in av1_fadst32_new_sse4_1()
602 buf1[16] = _mm_add_epi32(buf0[16], buf0[24]); in av1_fadst32_new_sse4_1()
603 buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]); in av1_fadst32_new_sse4_1()
604 buf1[17] = _mm_add_epi32(buf0[17], buf0[25]); in av1_fadst32_new_sse4_1()
605 buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]); in av1_fadst32_new_sse4_1()
606 buf1[18] = _mm_add_epi32(buf0[18], buf0[26]); in av1_fadst32_new_sse4_1()
607 buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]); in av1_fadst32_new_sse4_1()
608 buf1[19] = _mm_add_epi32(buf0[19], buf0[27]); in av1_fadst32_new_sse4_1()
609 buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]); in av1_fadst32_new_sse4_1()
610 buf1[20] = _mm_add_epi32(buf0[20], buf0[28]); in av1_fadst32_new_sse4_1()
611 buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]); in av1_fadst32_new_sse4_1()
612 buf1[21] = _mm_add_epi32(buf0[21], buf0[29]); in av1_fadst32_new_sse4_1()
613 buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]); in av1_fadst32_new_sse4_1()
614 buf1[22] = _mm_add_epi32(buf0[22], buf0[30]); in av1_fadst32_new_sse4_1()
615 buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]); in av1_fadst32_new_sse4_1()
616 buf1[23] = _mm_add_epi32(buf0[23], buf0[31]); in av1_fadst32_new_sse4_1()
617 buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]); in av1_fadst32_new_sse4_1()
623 buf0[0] = buf1[0]; in av1_fadst32_new_sse4_1()
624 buf0[1] = buf1[1]; in av1_fadst32_new_sse4_1()
625 buf0[2] = buf1[2]; in av1_fadst32_new_sse4_1()
626 buf0[3] = buf1[3]; in av1_fadst32_new_sse4_1()
627 buf0[4] = buf1[4]; in av1_fadst32_new_sse4_1()
628 buf0[5] = buf1[5]; in av1_fadst32_new_sse4_1()
629 buf0[6] = buf1[6]; in av1_fadst32_new_sse4_1()
630 buf0[7] = buf1[7]; in av1_fadst32_new_sse4_1()
631 btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], in av1_fadst32_new_sse4_1()
633 btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], in av1_fadst32_new_sse4_1()
634 buf0[11], bit); in av1_fadst32_new_sse4_1()
635 btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], in av1_fadst32_new_sse4_1()
636 buf0[13], bit); in av1_fadst32_new_sse4_1()
637 btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], in av1_fadst32_new_sse4_1()
638 buf0[15], bit); in av1_fadst32_new_sse4_1()
639 buf0[16] = buf1[16]; in av1_fadst32_new_sse4_1()
640 buf0[17] = buf1[17]; in av1_fadst32_new_sse4_1()
641 buf0[18] = buf1[18]; in av1_fadst32_new_sse4_1()
642 buf0[19] = buf1[19]; in av1_fadst32_new_sse4_1()
643 buf0[20] = buf1[20]; in av1_fadst32_new_sse4_1()
644 buf0[21] = buf1[21]; in av1_fadst32_new_sse4_1()
645 buf0[22] = buf1[22]; in av1_fadst32_new_sse4_1()
646 buf0[23] = buf1[23]; in av1_fadst32_new_sse4_1()
647 btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24], in av1_fadst32_new_sse4_1()
648 buf0[25], bit); in av1_fadst32_new_sse4_1()
649 btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26], in av1_fadst32_new_sse4_1()
650 buf0[27], bit); in av1_fadst32_new_sse4_1()
651 btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28], in av1_fadst32_new_sse4_1()
652 buf0[29], bit); in av1_fadst32_new_sse4_1()
653 btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30], in av1_fadst32_new_sse4_1()
654 buf0[31], bit); in av1_fadst32_new_sse4_1()
658 buf1[0] = _mm_add_epi32(buf0[0], buf0[4]); in av1_fadst32_new_sse4_1()
659 buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]); in av1_fadst32_new_sse4_1()
660 buf1[1] = _mm_add_epi32(buf0[1], buf0[5]); in av1_fadst32_new_sse4_1()
661 buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]); in av1_fadst32_new_sse4_1()
662 buf1[2] = _mm_add_epi32(buf0[2], buf0[6]); in av1_fadst32_new_sse4_1()
663 buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]); in av1_fadst32_new_sse4_1()
664 buf1[3] = _mm_add_epi32(buf0[3], buf0[7]); in av1_fadst32_new_sse4_1()
665 buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]); in av1_fadst32_new_sse4_1()
666 buf1[8] = _mm_add_epi32(buf0[8], buf0[12]); in av1_fadst32_new_sse4_1()
667 buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]); in av1_fadst32_new_sse4_1()
668 buf1[9] = _mm_add_epi32(buf0[9], buf0[13]); in av1_fadst32_new_sse4_1()
669 buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]); in av1_fadst32_new_sse4_1()
670 buf1[10] = _mm_add_epi32(buf0[10], buf0[14]); in av1_fadst32_new_sse4_1()
671 buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]); in av1_fadst32_new_sse4_1()
672 buf1[11] = _mm_add_epi32(buf0[11], buf0[15]); in av1_fadst32_new_sse4_1()
673 buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]); in av1_fadst32_new_sse4_1()
674 buf1[16] = _mm_add_epi32(buf0[16], buf0[20]); in av1_fadst32_new_sse4_1()
675 buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]); in av1_fadst32_new_sse4_1()
676 buf1[17] = _mm_add_epi32(buf0[17], buf0[21]); in av1_fadst32_new_sse4_1()
677 buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]); in av1_fadst32_new_sse4_1()
678 buf1[18] = _mm_add_epi32(buf0[18], buf0[22]); in av1_fadst32_new_sse4_1()
679 buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]); in av1_fadst32_new_sse4_1()
680 buf1[19] = _mm_add_epi32(buf0[19], buf0[23]); in av1_fadst32_new_sse4_1()
681 buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]); in av1_fadst32_new_sse4_1()
682 buf1[24] = _mm_add_epi32(buf0[24], buf0[28]); in av1_fadst32_new_sse4_1()
683 buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]); in av1_fadst32_new_sse4_1()
684 buf1[25] = _mm_add_epi32(buf0[25], buf0[29]); in av1_fadst32_new_sse4_1()
685 buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]); in av1_fadst32_new_sse4_1()
686 buf1[26] = _mm_add_epi32(buf0[26], buf0[30]); in av1_fadst32_new_sse4_1()
687 buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]); in av1_fadst32_new_sse4_1()
688 buf1[27] = _mm_add_epi32(buf0[27], buf0[31]); in av1_fadst32_new_sse4_1()
689 buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]); in av1_fadst32_new_sse4_1()
695 buf0[0] = buf1[0]; in av1_fadst32_new_sse4_1()
696 buf0[1] = buf1[1]; in av1_fadst32_new_sse4_1()
697 buf0[2] = buf1[2]; in av1_fadst32_new_sse4_1()
698 buf0[3] = buf1[3]; in av1_fadst32_new_sse4_1()
699 btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], in av1_fadst32_new_sse4_1()
700 buf0[5], bit); in av1_fadst32_new_sse4_1()
701 btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], in av1_fadst32_new_sse4_1()
702 buf0[7], bit); in av1_fadst32_new_sse4_1()
703 buf0[8] = buf1[8]; in av1_fadst32_new_sse4_1()
704 buf0[9] = buf1[9]; in av1_fadst32_new_sse4_1()
705 buf0[10] = buf1[10]; in av1_fadst32_new_sse4_1()
706 buf0[11] = buf1[11]; in av1_fadst32_new_sse4_1()
707 btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], in av1_fadst32_new_sse4_1()
708 buf0[13], bit); in av1_fadst32_new_sse4_1()
709 btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], in av1_fadst32_new_sse4_1()
710 buf0[15], bit); in av1_fadst32_new_sse4_1()
711 buf0[16] = buf1[16]; in av1_fadst32_new_sse4_1()
712 buf0[17] = buf1[17]; in av1_fadst32_new_sse4_1()
713 buf0[18] = buf1[18]; in av1_fadst32_new_sse4_1()
714 buf0[19] = buf1[19]; in av1_fadst32_new_sse4_1()
715 btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20], in av1_fadst32_new_sse4_1()
716 buf0[21], bit); in av1_fadst32_new_sse4_1()
717 btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22], in av1_fadst32_new_sse4_1()
718 buf0[23], bit); in av1_fadst32_new_sse4_1()
719 buf0[24] = buf1[24]; in av1_fadst32_new_sse4_1()
720 buf0[25] = buf1[25]; in av1_fadst32_new_sse4_1()
721 buf0[26] = buf1[26]; in av1_fadst32_new_sse4_1()
722 buf0[27] = buf1[27]; in av1_fadst32_new_sse4_1()
723 btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28], in av1_fadst32_new_sse4_1()
724 buf0[29], bit); in av1_fadst32_new_sse4_1()
725 btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30], in av1_fadst32_new_sse4_1()
726 buf0[31], bit); in av1_fadst32_new_sse4_1()
730 buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); in av1_fadst32_new_sse4_1()
731 buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); in av1_fadst32_new_sse4_1()
732 buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); in av1_fadst32_new_sse4_1()
733 buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); in av1_fadst32_new_sse4_1()
734 buf1[4] = _mm_add_epi32(buf0[4], buf0[6]); in av1_fadst32_new_sse4_1()
735 buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]); in av1_fadst32_new_sse4_1()
736 buf1[5] = _mm_add_epi32(buf0[5], buf0[7]); in av1_fadst32_new_sse4_1()
737 buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]); in av1_fadst32_new_sse4_1()
738 buf1[8] = _mm_add_epi32(buf0[8], buf0[10]); in av1_fadst32_new_sse4_1()
739 buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]); in av1_fadst32_new_sse4_1()
740 buf1[9] = _mm_add_epi32(buf0[9], buf0[11]); in av1_fadst32_new_sse4_1()
741 buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]); in av1_fadst32_new_sse4_1()
742 buf1[12] = _mm_add_epi32(buf0[12], buf0[14]); in av1_fadst32_new_sse4_1()
743 buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]); in av1_fadst32_new_sse4_1()
744 buf1[13] = _mm_add_epi32(buf0[13], buf0[15]); in av1_fadst32_new_sse4_1()
745 buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]); in av1_fadst32_new_sse4_1()
746 buf1[16] = _mm_add_epi32(buf0[16], buf0[18]); in av1_fadst32_new_sse4_1()
747 buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]); in av1_fadst32_new_sse4_1()
748 buf1[17] = _mm_add_epi32(buf0[17], buf0[19]); in av1_fadst32_new_sse4_1()
749 buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]); in av1_fadst32_new_sse4_1()
750 buf1[20] = _mm_add_epi32(buf0[20], buf0[22]); in av1_fadst32_new_sse4_1()
751 buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]); in av1_fadst32_new_sse4_1()
752 buf1[21] = _mm_add_epi32(buf0[21], buf0[23]); in av1_fadst32_new_sse4_1()
753 buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]); in av1_fadst32_new_sse4_1()
754 buf1[24] = _mm_add_epi32(buf0[24], buf0[26]); in av1_fadst32_new_sse4_1()
755 buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]); in av1_fadst32_new_sse4_1()
756 buf1[25] = _mm_add_epi32(buf0[25], buf0[27]); in av1_fadst32_new_sse4_1()
757 buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]); in av1_fadst32_new_sse4_1()
758 buf1[28] = _mm_add_epi32(buf0[28], buf0[30]); in av1_fadst32_new_sse4_1()
759 buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]); in av1_fadst32_new_sse4_1()
760 buf1[29] = _mm_add_epi32(buf0[29], buf0[31]); in av1_fadst32_new_sse4_1()
761 buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]); in av1_fadst32_new_sse4_1()
767 buf0[0] = buf1[0]; in av1_fadst32_new_sse4_1()
768 buf0[1] = buf1[1]; in av1_fadst32_new_sse4_1()
769 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], in av1_fadst32_new_sse4_1()
770 buf0[3], bit); in av1_fadst32_new_sse4_1()
771 buf0[4] = buf1[4]; in av1_fadst32_new_sse4_1()
772 buf0[5] = buf1[5]; in av1_fadst32_new_sse4_1()
773 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], in av1_fadst32_new_sse4_1()
774 buf0[7], bit); in av1_fadst32_new_sse4_1()
775 buf0[8] = buf1[8]; in av1_fadst32_new_sse4_1()
776 buf0[9] = buf1[9]; in av1_fadst32_new_sse4_1()
777 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], in av1_fadst32_new_sse4_1()
778 buf0[11], bit); in av1_fadst32_new_sse4_1()
779 buf0[12] = buf1[12]; in av1_fadst32_new_sse4_1()
780 buf0[13] = buf1[13]; in av1_fadst32_new_sse4_1()
781 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], in av1_fadst32_new_sse4_1()
782 buf0[15], bit); in av1_fadst32_new_sse4_1()
783 buf0[16] = buf1[16]; in av1_fadst32_new_sse4_1()
784 buf0[17] = buf1[17]; in av1_fadst32_new_sse4_1()
785 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18], in av1_fadst32_new_sse4_1()
786 buf0[19], bit); in av1_fadst32_new_sse4_1()
787 buf0[20] = buf1[20]; in av1_fadst32_new_sse4_1()
788 buf0[21] = buf1[21]; in av1_fadst32_new_sse4_1()
789 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22], in av1_fadst32_new_sse4_1()
790 buf0[23], bit); in av1_fadst32_new_sse4_1()
791 buf0[24] = buf1[24]; in av1_fadst32_new_sse4_1()
792 buf0[25] = buf1[25]; in av1_fadst32_new_sse4_1()
793 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26], in av1_fadst32_new_sse4_1()
794 buf0[27], bit); in av1_fadst32_new_sse4_1()
795 buf0[28] = buf1[28]; in av1_fadst32_new_sse4_1()
796 buf0[29] = buf1[29]; in av1_fadst32_new_sse4_1()
797 btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30], in av1_fadst32_new_sse4_1()
798 buf0[31], bit); in av1_fadst32_new_sse4_1()
802 buf1[0] = buf0[0]; in av1_fadst32_new_sse4_1()
803 buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]); in av1_fadst32_new_sse4_1()
804 buf1[2] = buf0[24]; in av1_fadst32_new_sse4_1()
805 buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]); in av1_fadst32_new_sse4_1()
806 buf1[4] = buf0[12]; in av1_fadst32_new_sse4_1()
807 buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]); in av1_fadst32_new_sse4_1()
808 buf1[6] = buf0[20]; in av1_fadst32_new_sse4_1()
809 buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]); in av1_fadst32_new_sse4_1()
810 buf1[8] = buf0[6]; in av1_fadst32_new_sse4_1()
811 buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]); in av1_fadst32_new_sse4_1()
812 buf1[10] = buf0[30]; in av1_fadst32_new_sse4_1()
813 buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]); in av1_fadst32_new_sse4_1()
814 buf1[12] = buf0[10]; in av1_fadst32_new_sse4_1()
815 buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]); in av1_fadst32_new_sse4_1()
816 buf1[14] = buf0[18]; in av1_fadst32_new_sse4_1()
817 buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); in av1_fadst32_new_sse4_1()
818 buf1[16] = buf0[3]; in av1_fadst32_new_sse4_1()
819 buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]); in av1_fadst32_new_sse4_1()
820 buf1[18] = buf0[27]; in av1_fadst32_new_sse4_1()
821 buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]); in av1_fadst32_new_sse4_1()
822 buf1[20] = buf0[15]; in av1_fadst32_new_sse4_1()
823 buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]); in av1_fadst32_new_sse4_1()
824 buf1[22] = buf0[23]; in av1_fadst32_new_sse4_1()
825 buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]); in av1_fadst32_new_sse4_1()
826 buf1[24] = buf0[5]; in av1_fadst32_new_sse4_1()
827 buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]); in av1_fadst32_new_sse4_1()
828 buf1[26] = buf0[29]; in av1_fadst32_new_sse4_1()
829 buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]); in av1_fadst32_new_sse4_1()
830 buf1[28] = buf0[9]; in av1_fadst32_new_sse4_1()
831 buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]); in av1_fadst32_new_sse4_1()
832 buf1[30] = buf0[17]; in av1_fadst32_new_sse4_1()
833 buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); in av1_fadst32_new_sse4_1()