1 /*******************************************************************************
2 * Copyright 2020-2021 Intel Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16
17 #include "cpu/x64/jit_generator.hpp"
18
19 #include "cpu/x64/gemm/s8x8s32/common_u8.hpp"
20
21 namespace dnnl {
22 namespace impl {
23 namespace cpu {
24 namespace x64 {
25
jit_avx_kernel_gemm_s8u8s32_kern()26 jit_avx_kernel_gemm_s8u8s32_kern::jit_avx_kernel_gemm_s8u8s32_kern()
27 : jit_generator(nullptr, S8U8S32_COMPUTE_KERNEL_CODE_SIZE) {}
28
generate()29 void jit_avx_kernel_gemm_s8u8s32_kern::generate() {
30
31 #ifndef _WIN32
32
33 #define M rdi
34 #define N rsi
35 #define K rdx
36 #define A r8
37 #define B r9
38 #define C r10
39 #define LDC r11
40
41 #define AA rcx
42 #define I r12
43 #define J r13
44 #define H rax
45 #define AO r14
46 #define BO r15
47 #define CO1 rbx
48 #define CO2 rbp
49
50 #else
51
52 #define M rcx
53 #define N rdx
54 #define K r8
55 #define A rsi
56 #define B r9
57 #define C r10
58 #define LDC r11
59
60 #define AA rdi
61 #define I r12
62 #define J r13
63 #define H rax
64 #define AO r14
65 #define BO r15
66 #define CO1 rbx
67 #define CO2 rbp
68
69 #endif
70
71 #ifdef _WIN32
72 #define ARG_A (args_offset - 16) + rsp
73 #define ARG_B (args_offset - 8) + rsp
74 #endif
75 #define ARG_C ((args_offset + 0) + rsp)
76 #define ARG_LDC ((args_offset + 8) + rsp)
77
78 inLocalLabel();
79 {
80 std::vector<Xbyak::Label> labels(91);
81
82 auto stack_alloc_size = 32;
83 auto args_offset = stack_alloc_size + get_size_of_abi_save_regs() + 8;
84 #ifdef _WIN32
85 args_offset += 48;
86 #endif
87 preamble();
88 sub(rsp, stack_alloc_size);
89 #ifdef _WIN32
90 mov(A, ptr[ARG_A]);
91 mov(B, ptr[ARG_B]);
92 #endif
93
94 mov(C, qword[ARG_C]);
95 mov(LDC, qword[ARG_LDC]);
96 sub(A, -128);
97 sub(B, -128);
98 mov(M, qword[M]);
99 mov(N, qword[N]);
100 mov(K, qword[K]);
101 lea(LDC, ptr[LDC * 4 + 0x0]);
102 vxorps(xmm8, xmm8, xmm8);
103 vxorps(xmm9, xmm9, xmm9);
104 vxorps(xmm10, xmm10, xmm10);
105 vxorps(xmm11, xmm11, xmm11);
106 vxorps(xmm12, xmm12, xmm12);
107 vxorps(xmm13, xmm13, xmm13);
108 vxorps(xmm14, xmm14, xmm14);
109 vxorps(xmm15, xmm15, xmm15);
110 mov(H, 0x10001);
111 movq(xmm7, H);
112 vpshufd(xmm7, xmm7, 0x0);
113 mov(J, M);
114 cmp(J, 0x10);
115 jl(labels[73], T_NEAR);
116 align(4);
117
118 L(labels[70]);
119 mov(CO1, C);
120 add(C, 0x40);
121 mov(BO, B);
122 mov(AA, K);
123 shl(AA, 0x20);
124 lea(AA, ptr[A + AA * 1 + 0x200]);
125 mov(I, N);
126 cmp(I, 0x2);
127 jl(labels[63], T_NEAR);
128 align(4);
129
130 L(labels[78]);
131 mov(AO, A);
132 vmovdqu(xmm0, xword[AO - 0x80]);
133 vmovdqu(xmm1, xword[AO - 0x70]);
134 vmovdqu(xmm2, xword[AO - 0x60]);
135 vmovdqu(xmm3, xword[AO - 0x50]);
136 vmovdqu(xmm5, xword[BO - 0x80]);
137 mov(H, K);
138 sar(H, 0x3);
139 jle(labels[59], T_NEAR);
140 sub(H, 0x8);
141 jle(labels[57], T_NEAR);
142 align(4);
143
144 L(labels[88]);
145 vpshufd(xmm4, xmm5, 0x0);
146 vpmaddubsw(xmm6, xmm4, xmm0);
147 vpmaddwd(xmm6, xmm7, xmm6);
148 vpaddd(xmm8, xmm8, xmm6);
149 prefetcht0(byte[AO + 0x180]);
150 vpmaddubsw(xmm6, xmm4, xmm1);
151 vpmaddwd(xmm6, xmm7, xmm6);
152 vpaddd(xmm10, xmm10, xmm6);
153 vpmaddubsw(xmm6, xmm4, xmm2);
154 vpmaddwd(xmm6, xmm7, xmm6);
155 vpaddd(xmm12, xmm12, xmm6);
156 vpmaddubsw(xmm6, xmm4, xmm3);
157 vpmaddwd(xmm6, xmm7, xmm6);
158 vpaddd(xmm14, xmm14, xmm6);
159 prefetcht0(byte[BO]);
160 vpshufd(xmm4, xmm5, 0x55);
161 vpmaddubsw(xmm6, xmm4, xmm0);
162 vpmaddwd(xmm6, xmm7, xmm6);
163 vpaddd(xmm9, xmm9, xmm6);
164 vpmaddubsw(xmm6, xmm4, xmm1);
165 vpmaddwd(xmm6, xmm7, xmm6);
166 vpaddd(xmm11, xmm11, xmm6);
167 vpmaddubsw(xmm6, xmm4, xmm2);
168 vpmaddwd(xmm6, xmm7, xmm6);
169 vpaddd(xmm13, xmm13, xmm6);
170 vpmaddubsw(xmm6, xmm4, xmm3);
171 vpmaddwd(xmm6, xmm7, xmm6);
172 vpaddd(xmm15, xmm15, xmm6);
173 vmovdqu(xmm0, xword[AO - 0x40]);
174 vmovdqu(xmm1, xword[AO - 0x30]);
175 vmovdqu(xmm2, xword[AO - 0x20]);
176 vmovdqu(xmm3, xword[AO - 0x10]);
177 prefetcht0(byte[AO + 0x1c0]);
178 vpshufd(xmm4, xmm5, 0xaa);
179 vpmaddubsw(xmm6, xmm4, xmm0);
180 vpmaddwd(xmm6, xmm7, xmm6);
181 vpaddd(xmm8, xmm8, xmm6);
182 vpmaddubsw(xmm6, xmm4, xmm1);
183 vpmaddwd(xmm6, xmm7, xmm6);
184 vpaddd(xmm10, xmm10, xmm6);
185 vpmaddubsw(xmm6, xmm4, xmm2);
186 vpmaddwd(xmm6, xmm7, xmm6);
187 vpaddd(xmm12, xmm12, xmm6);
188 vpmaddubsw(xmm6, xmm4, xmm3);
189 vpmaddwd(xmm6, xmm7, xmm6);
190 vpaddd(xmm14, xmm14, xmm6);
191 vpshufd(xmm4, xmm5, 0xff);
192 vpmaddubsw(xmm6, xmm4, xmm0);
193 vpmaddwd(xmm6, xmm7, xmm6);
194 vpaddd(xmm9, xmm9, xmm6);
195 vpmaddubsw(xmm6, xmm4, xmm1);
196 vpmaddwd(xmm6, xmm7, xmm6);
197 vpaddd(xmm11, xmm11, xmm6);
198 vpmaddubsw(xmm6, xmm4, xmm2);
199 vpmaddwd(xmm6, xmm7, xmm6);
200 vpaddd(xmm13, xmm13, xmm6);
201 vpmaddubsw(xmm6, xmm4, xmm3);
202 vpmaddwd(xmm6, xmm7, xmm6);
203 vpaddd(xmm15, xmm15, xmm6);
204 vmovdqu(xmm5, xword[BO - 0x70]);
205 prefetcht1(byte[AA - 0x80]);
206 vmovdqu(xmm0, xword[AO]);
207 vmovdqu(xmm1, xword[AO + 0x10]);
208 vmovdqu(xmm2, xword[AO + 0x20]);
209 vmovdqu(xmm3, xword[AO + 0x30]);
210 add(AA, 0x4);
211 add(AO, 0x80);
212 add(BO, 0x10);
213 sub(H, 0x1);
214 jg(labels[88], T_NEAR);
215 align(4);
216
217 L(labels[57]);
218 prefetcht0(byte[CO1 + 0x3c]);
219 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
220 add(H, 0x8);
221 jle(labels[59], T_NEAR);
222 align(4);
223
224 L(labels[58]);
225 vpshufd(xmm4, xmm5, 0x0);
226 vpmaddubsw(xmm6, xmm4, xmm0);
227 vpmaddwd(xmm6, xmm7, xmm6);
228 vpaddd(xmm8, xmm8, xmm6);
229 prefetcht0(byte[AO + 0x180]);
230 vpmaddubsw(xmm6, xmm4, xmm1);
231 vpmaddwd(xmm6, xmm7, xmm6);
232 vpaddd(xmm10, xmm10, xmm6);
233 vpmaddubsw(xmm6, xmm4, xmm2);
234 vpmaddwd(xmm6, xmm7, xmm6);
235 vpaddd(xmm12, xmm12, xmm6);
236 vpmaddubsw(xmm6, xmm4, xmm3);
237 vpmaddwd(xmm6, xmm7, xmm6);
238 vpaddd(xmm14, xmm14, xmm6);
239 prefetcht0(byte[BO]);
240 vpshufd(xmm4, xmm5, 0x55);
241 vpmaddubsw(xmm6, xmm4, xmm0);
242 vpmaddwd(xmm6, xmm7, xmm6);
243 vpaddd(xmm9, xmm9, xmm6);
244 vpmaddubsw(xmm6, xmm4, xmm1);
245 vpmaddwd(xmm6, xmm7, xmm6);
246 vpaddd(xmm11, xmm11, xmm6);
247 vpmaddubsw(xmm6, xmm4, xmm2);
248 vpmaddwd(xmm6, xmm7, xmm6);
249 vpaddd(xmm13, xmm13, xmm6);
250 vpmaddubsw(xmm6, xmm4, xmm3);
251 vpmaddwd(xmm6, xmm7, xmm6);
252 vpaddd(xmm15, xmm15, xmm6);
253 vmovdqu(xmm0, xword[AO - 0x40]);
254 vmovdqu(xmm1, xword[AO - 0x30]);
255 vmovdqu(xmm2, xword[AO - 0x20]);
256 vmovdqu(xmm3, xword[AO - 0x10]);
257 prefetcht0(byte[AO + 0x1c0]);
258 vpshufd(xmm4, xmm5, 0xaa);
259 vpmaddubsw(xmm6, xmm4, xmm0);
260 vpmaddwd(xmm6, xmm7, xmm6);
261 vpaddd(xmm8, xmm8, xmm6);
262 vpmaddubsw(xmm6, xmm4, xmm1);
263 vpmaddwd(xmm6, xmm7, xmm6);
264 vpaddd(xmm10, xmm10, xmm6);
265 vpmaddubsw(xmm6, xmm4, xmm2);
266 vpmaddwd(xmm6, xmm7, xmm6);
267 vpaddd(xmm12, xmm12, xmm6);
268 vpmaddubsw(xmm6, xmm4, xmm3);
269 vpmaddwd(xmm6, xmm7, xmm6);
270 vpaddd(xmm14, xmm14, xmm6);
271 vpshufd(xmm4, xmm5, 0xff);
272 vpmaddubsw(xmm6, xmm4, xmm0);
273 vpmaddwd(xmm6, xmm7, xmm6);
274 vpaddd(xmm9, xmm9, xmm6);
275 vpmaddubsw(xmm6, xmm4, xmm1);
276 vpmaddwd(xmm6, xmm7, xmm6);
277 vpaddd(xmm11, xmm11, xmm6);
278 vpmaddubsw(xmm6, xmm4, xmm2);
279 vpmaddwd(xmm6, xmm7, xmm6);
280 vpaddd(xmm13, xmm13, xmm6);
281 vpmaddubsw(xmm6, xmm4, xmm3);
282 vpmaddwd(xmm6, xmm7, xmm6);
283 vpaddd(xmm15, xmm15, xmm6);
284 vmovdqu(xmm5, xword[BO - 0x70]);
285 prefetcht1(byte[AA - 0x80]);
286 vmovdqu(xmm0, xword[AO]);
287 vmovdqu(xmm1, xword[AO + 0x10]);
288 vmovdqu(xmm2, xword[AO + 0x20]);
289 vmovdqu(xmm3, xword[AO + 0x30]);
290 add(AA, 0x4);
291 add(AO, 0x80);
292 add(BO, 0x10);
293 sub(H, 0x1);
294 jg(labels[58], T_NEAR);
295 align(4);
296
297 L(labels[59]);
298 mov(H, K);
299 test(H, 0x4);
300 je(labels[60], T_NEAR);
301 vpshufd(xmm4, xmm5, 0x0);
302 vpmaddubsw(xmm6, xmm4, xmm0);
303 vpmaddwd(xmm6, xmm7, xmm6);
304 vpaddd(xmm8, xmm8, xmm6);
305 vpmaddubsw(xmm6, xmm4, xmm1);
306 vpmaddwd(xmm6, xmm7, xmm6);
307 vpaddd(xmm10, xmm10, xmm6);
308 vpmaddubsw(xmm6, xmm4, xmm2);
309 vpmaddwd(xmm6, xmm7, xmm6);
310 vpaddd(xmm12, xmm12, xmm6);
311 vpmaddubsw(xmm6, xmm4, xmm3);
312 vpmaddwd(xmm6, xmm7, xmm6);
313 vpaddd(xmm14, xmm14, xmm6);
314 vpshufd(xmm4, xmm5, 0x55);
315 vpmaddubsw(xmm6, xmm4, xmm0);
316 vpmaddwd(xmm6, xmm7, xmm6);
317 vpaddd(xmm9, xmm9, xmm6);
318 vpmaddubsw(xmm6, xmm4, xmm1);
319 vpmaddwd(xmm6, xmm7, xmm6);
320 vpaddd(xmm11, xmm11, xmm6);
321 vpmaddubsw(xmm6, xmm4, xmm2);
322 vpmaddwd(xmm6, xmm7, xmm6);
323 vpaddd(xmm13, xmm13, xmm6);
324 vpmaddubsw(xmm6, xmm4, xmm3);
325 vpmaddwd(xmm6, xmm7, xmm6);
326 vpaddd(xmm15, xmm15, xmm6);
327 add(AO, 0x40);
328 add(BO, 0x8);
329 align(4);
330
331 L(labels[60]);
332 mov(H, K);
333 test(H, 0x2);
334 je(labels[61], T_NEAR);
335 vxorps(xmm6, xmm6, xmm6);
336 vmovdqu(xmm1, xword[AO - 0x80]);
337 vpunpcklwd(xmm0, xmm1, xmm6);
338 vpunpckhwd(xmm1, xmm1, xmm6);
339 vmovdqu(xmm3, xword[AO - 0x70]);
340 vpunpcklwd(xmm2, xmm3, xmm6);
341 vpunpckhwd(xmm3, xmm3, xmm6);
342 vbroadcastss(xmm5, dword[BO - 0x80]);
343 vpunpcklwd(xmm5, xmm5, xmm5);
344 vpshufd(xmm4, xmm5, 0x0);
345 vpmaddubsw(xmm6, xmm4, xmm0);
346 vpmaddwd(xmm6, xmm7, xmm6);
347 vpaddd(xmm8, xmm8, xmm6);
348 vpmaddubsw(xmm6, xmm4, xmm1);
349 vpmaddwd(xmm6, xmm7, xmm6);
350 vpaddd(xmm10, xmm10, xmm6);
351 vpmaddubsw(xmm6, xmm4, xmm2);
352 vpmaddwd(xmm6, xmm7, xmm6);
353 vpaddd(xmm12, xmm12, xmm6);
354 vpmaddubsw(xmm6, xmm4, xmm3);
355 vpmaddwd(xmm6, xmm7, xmm6);
356 vpaddd(xmm14, xmm14, xmm6);
357 vpshufd(xmm4, xmm5, 0x55);
358 vpmaddubsw(xmm6, xmm4, xmm0);
359 vpmaddwd(xmm6, xmm7, xmm6);
360 vpaddd(xmm9, xmm9, xmm6);
361 vpmaddubsw(xmm6, xmm4, xmm1);
362 vpmaddwd(xmm6, xmm7, xmm6);
363 vpaddd(xmm11, xmm11, xmm6);
364 vpmaddubsw(xmm6, xmm4, xmm2);
365 vpmaddwd(xmm6, xmm7, xmm6);
366 vpaddd(xmm13, xmm13, xmm6);
367 vpmaddubsw(xmm6, xmm4, xmm3);
368 vpmaddwd(xmm6, xmm7, xmm6);
369 vpaddd(xmm15, xmm15, xmm6);
370 add(AO, 0x20);
371 add(BO, 0x4);
372 align(4);
373
374 L(labels[61]);
375 mov(H, K);
376 test(H, 0x1);
377 je(labels[62], T_NEAR);
378 vxorps(xmm6, xmm6, xmm6);
379 vbroadcastss(xmm0, dword[AO - 0x80]);
380 vpunpcklbw(xmm0, xmm0, xmm6);
381 vpunpcklwd(xmm0, xmm0, xmm6);
382 vbroadcastss(xmm1, dword[AO - 0x7c]);
383 vpunpcklbw(xmm1, xmm1, xmm6);
384 vpunpcklwd(xmm1, xmm1, xmm6);
385 vbroadcastss(xmm2, dword[AO - 0x78]);
386 vpunpcklbw(xmm2, xmm2, xmm6);
387 vpunpcklwd(xmm2, xmm2, xmm6);
388 vbroadcastss(xmm3, dword[AO - 0x74]);
389 vpunpcklbw(xmm3, xmm3, xmm6);
390 vpunpcklwd(xmm3, xmm3, xmm6);
391 vbroadcastss(xmm5, dword[BO - 0x80]);
392 vpunpcklbw(xmm5, xmm5, xmm5);
393 vpunpcklwd(xmm5, xmm5, xmm5);
394 vpshufd(xmm4, xmm5, 0x0);
395 vpmaddubsw(xmm6, xmm4, xmm0);
396 vpmaddwd(xmm6, xmm7, xmm6);
397 vpaddd(xmm8, xmm8, xmm6);
398 vpmaddubsw(xmm6, xmm4, xmm1);
399 vpmaddwd(xmm6, xmm7, xmm6);
400 vpaddd(xmm10, xmm10, xmm6);
401 vpmaddubsw(xmm6, xmm4, xmm2);
402 vpmaddwd(xmm6, xmm7, xmm6);
403 vpaddd(xmm12, xmm12, xmm6);
404 vpmaddubsw(xmm6, xmm4, xmm3);
405 vpmaddwd(xmm6, xmm7, xmm6);
406 vpaddd(xmm14, xmm14, xmm6);
407 vpshufd(xmm4, xmm5, 0x55);
408 vpmaddubsw(xmm6, xmm4, xmm0);
409 vpmaddwd(xmm6, xmm7, xmm6);
410 vpaddd(xmm9, xmm9, xmm6);
411 vpmaddubsw(xmm6, xmm4, xmm1);
412 vpmaddwd(xmm6, xmm7, xmm6);
413 vpaddd(xmm11, xmm11, xmm6);
414 vpmaddubsw(xmm6, xmm4, xmm2);
415 vpmaddwd(xmm6, xmm7, xmm6);
416 vpaddd(xmm13, xmm13, xmm6);
417 vpmaddubsw(xmm6, xmm4, xmm3);
418 vpmaddwd(xmm6, xmm7, xmm6);
419 vpaddd(xmm15, xmm15, xmm6);
420 add(AO, 0x10);
421 add(BO, 0x2);
422 align(4);
423
424 L(labels[62]);
425 vmovdqu(xmm0, xword[CO1]);
426 vpaddd(xmm8, xmm8, xmm0);
427 vmovdqu(xword[CO1], xmm8);
428 vxorps(xmm8, xmm8, xmm8);
429 vmovdqu(xmm0, xword[CO1 + 0x10]);
430 vpaddd(xmm10, xmm10, xmm0);
431 vmovdqu(xword[CO1 + 0x10], xmm10);
432 vxorps(xmm10, xmm10, xmm10);
433 vmovdqu(xmm0, xword[CO1 + 0x20]);
434 vpaddd(xmm12, xmm12, xmm0);
435 vmovdqu(xword[CO1 + 0x20], xmm12);
436 vxorps(xmm12, xmm12, xmm12);
437 vmovdqu(xmm0, xword[CO1 + 0x30]);
438 vpaddd(xmm14, xmm14, xmm0);
439 vmovdqu(xword[CO1 + 0x30], xmm14);
440 vxorps(xmm14, xmm14, xmm14);
441 vmovdqu(xmm0, xword[CO1 + LDC * 1]);
442 vpaddd(xmm9, xmm9, xmm0);
443 vmovdqu(xword[CO1 + LDC * 1], xmm9);
444 vxorps(xmm9, xmm9, xmm9);
445 vmovdqu(xmm0, xword[CO1 + LDC * 1 + 0x10]);
446 vpaddd(xmm11, xmm11, xmm0);
447 vmovdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
448 vxorps(xmm11, xmm11, xmm11);
449 vmovdqu(xmm0, xword[CO1 + LDC * 1 + 0x20]);
450 vpaddd(xmm13, xmm13, xmm0);
451 vmovdqu(xword[CO1 + LDC * 1 + 0x20], xmm13);
452 vxorps(xmm13, xmm13, xmm13);
453 vmovdqu(xmm0, xword[CO1 + LDC * 1 + 0x30]);
454 vpaddd(xmm15, xmm15, xmm0);
455 vmovdqu(xword[CO1 + LDC * 1 + 0x30], xmm15);
456 vxorps(xmm15, xmm15, xmm15);
457 lea(CO1, ptr[CO1 + LDC * 2]);
458 sub(I, 0x2);
459 cmp(I, 0x2);
460 jge(labels[78], T_NEAR);
461 align(4);
462
463 L(labels[63]);
464 test(I, 0x1);
465 jle(labels[72], T_NEAR);
466 mov(AO, A);
467 vmovdqu(xmm0, xword[AO - 0x80]);
468 vmovdqu(xmm1, xword[AO - 0x70]);
469 vmovdqu(xmm2, xword[AO - 0x60]);
470 vmovdqu(xmm3, xword[AO - 0x50]);
471 vmovdqu(xmm5, xword[BO - 0x80]);
472 mov(H, K);
473 sar(H, 0x3);
474 jle(labels[67], T_NEAR);
475 sub(H, 0x8);
476 jle(labels[65], T_NEAR);
477 align(4);
478
479 L(labels[64]);
480 vpshufd(xmm4, xmm5, 0x0);
481 vpmaddubsw(xmm6, xmm4, xmm0);
482 vpmaddwd(xmm6, xmm7, xmm6);
483 vpaddd(xmm8, xmm8, xmm6);
484 prefetcht0(byte[AO + 0x180]);
485 vpmaddubsw(xmm6, xmm4, xmm1);
486 vpmaddwd(xmm6, xmm7, xmm6);
487 vpaddd(xmm10, xmm10, xmm6);
488 vpmaddubsw(xmm6, xmm4, xmm2);
489 vpmaddwd(xmm6, xmm7, xmm6);
490 vpaddd(xmm12, xmm12, xmm6);
491 vpmaddubsw(xmm6, xmm4, xmm3);
492 vpmaddwd(xmm6, xmm7, xmm6);
493 vpaddd(xmm14, xmm14, xmm6);
494 prefetcht0(byte[BO]);
495 vmovdqu(xmm0, xword[AO - 0x40]);
496 vmovdqu(xmm1, xword[AO - 0x30]);
497 vmovdqu(xmm2, xword[AO - 0x20]);
498 vmovdqu(xmm3, xword[AO - 0x10]);
499 prefetcht0(byte[AO + 0x1c0]);
500 vpshufd(xmm4, xmm5, 0x55);
501 vpmaddubsw(xmm6, xmm4, xmm0);
502 vpmaddwd(xmm6, xmm7, xmm6);
503 vpaddd(xmm8, xmm8, xmm6);
504 vpmaddubsw(xmm6, xmm4, xmm1);
505 vpmaddwd(xmm6, xmm7, xmm6);
506 vpaddd(xmm10, xmm10, xmm6);
507 vpmaddubsw(xmm6, xmm4, xmm2);
508 vpmaddwd(xmm6, xmm7, xmm6);
509 vpaddd(xmm12, xmm12, xmm6);
510 vpmaddubsw(xmm6, xmm4, xmm3);
511 vpmaddwd(xmm6, xmm7, xmm6);
512 vpaddd(xmm14, xmm14, xmm6);
513 vmovdqu(xmm5, xword[BO - 0x78]);
514 prefetcht1(byte[AA - 0x80]);
515 vmovdqu(xmm0, xword[AO]);
516 vmovdqu(xmm1, xword[AO + 0x10]);
517 vmovdqu(xmm2, xword[AO + 0x20]);
518 vmovdqu(xmm3, xword[AO + 0x30]);
519 add(AA, 0x4);
520 add(AO, 0x80);
521 add(BO, 0x8);
522 sub(H, 0x1);
523 jg(labels[64], T_NEAR);
524 align(4);
525
526 L(labels[65]);
527 prefetcht0(byte[CO1 + 0x3c]);
528 add(H, 0x8);
529 jle(labels[67], T_NEAR);
530 align(4);
531
532 L(labels[66]);
533 vpshufd(xmm4, xmm5, 0x0);
534 vpmaddubsw(xmm6, xmm4, xmm0);
535 vpmaddwd(xmm6, xmm7, xmm6);
536 vpaddd(xmm8, xmm8, xmm6);
537 prefetcht0(byte[AO + 0x180]);
538 vpmaddubsw(xmm6, xmm4, xmm1);
539 vpmaddwd(xmm6, xmm7, xmm6);
540 vpaddd(xmm10, xmm10, xmm6);
541 vpmaddubsw(xmm6, xmm4, xmm2);
542 vpmaddwd(xmm6, xmm7, xmm6);
543 vpaddd(xmm12, xmm12, xmm6);
544 vpmaddubsw(xmm6, xmm4, xmm3);
545 vpmaddwd(xmm6, xmm7, xmm6);
546 vpaddd(xmm14, xmm14, xmm6);
547 prefetcht0(byte[BO]);
548 vmovdqu(xmm0, xword[AO - 0x40]);
549 vmovdqu(xmm1, xword[AO - 0x30]);
550 vmovdqu(xmm2, xword[AO - 0x20]);
551 vmovdqu(xmm3, xword[AO - 0x10]);
552 prefetcht0(byte[AO + 0x1c0]);
553 vpshufd(xmm4, xmm5, 0x55);
554 vpmaddubsw(xmm6, xmm4, xmm0);
555 vpmaddwd(xmm6, xmm7, xmm6);
556 vpaddd(xmm8, xmm8, xmm6);
557 vpmaddubsw(xmm6, xmm4, xmm1);
558 vpmaddwd(xmm6, xmm7, xmm6);
559 vpaddd(xmm10, xmm10, xmm6);
560 vpmaddubsw(xmm6, xmm4, xmm2);
561 vpmaddwd(xmm6, xmm7, xmm6);
562 vpaddd(xmm12, xmm12, xmm6);
563 vpmaddubsw(xmm6, xmm4, xmm3);
564 vpmaddwd(xmm6, xmm7, xmm6);
565 vpaddd(xmm14, xmm14, xmm6);
566 vmovdqu(xmm5, xword[BO - 0x78]);
567 prefetcht1(byte[AA - 0x80]);
568 vmovdqu(xmm0, xword[AO]);
569 vmovdqu(xmm1, xword[AO + 0x10]);
570 vmovdqu(xmm2, xword[AO + 0x20]);
571 vmovdqu(xmm3, xword[AO + 0x30]);
572 add(AA, 0x4);
573 add(AO, 0x80);
574 add(BO, 0x8);
575 sub(H, 0x1);
576 jg(labels[66], T_NEAR);
577 align(4);
578
579 L(labels[67]);
580 mov(H, K);
581 test(H, 0x4);
582 je(labels[68], T_NEAR);
583 vpshufd(xmm4, xmm5, 0x0);
584 vpmaddubsw(xmm6, xmm4, xmm0);
585 vpmaddwd(xmm6, xmm7, xmm6);
586 vpaddd(xmm8, xmm8, xmm6);
587 vpmaddubsw(xmm6, xmm4, xmm1);
588 vpmaddwd(xmm6, xmm7, xmm6);
589 vpaddd(xmm10, xmm10, xmm6);
590 vpmaddubsw(xmm6, xmm4, xmm2);
591 vpmaddwd(xmm6, xmm7, xmm6);
592 vpaddd(xmm12, xmm12, xmm6);
593 vpmaddubsw(xmm6, xmm4, xmm3);
594 vpmaddwd(xmm6, xmm7, xmm6);
595 vpaddd(xmm14, xmm14, xmm6);
596 add(AO, 0x40);
597 add(BO, 0x4);
598 align(4);
599
600 L(labels[68]);
601 mov(H, K);
602 test(H, 0x2);
603 je(labels[69], T_NEAR);
604 vxorps(xmm6, xmm6, xmm6);
605 vmovdqu(xmm1, xword[AO - 0x80]);
606 vpunpcklwd(xmm0, xmm1, xmm6);
607 vpunpckhwd(xmm1, xmm1, xmm6);
608 vmovdqu(xmm3, xword[AO - 0x70]);
609 vpunpcklwd(xmm2, xmm3, xmm6);
610 vpunpckhwd(xmm3, xmm3, xmm6);
611 vbroadcastss(xmm5, dword[BO - 0x80]);
612 vpunpcklwd(xmm5, xmm5, xmm5);
613 vpshufd(xmm4, xmm5, 0x0);
614 vpmaddubsw(xmm6, xmm4, xmm0);
615 vpmaddwd(xmm6, xmm7, xmm6);
616 vpaddd(xmm8, xmm8, xmm6);
617 vpmaddubsw(xmm6, xmm4, xmm1);
618 vpmaddwd(xmm6, xmm7, xmm6);
619 vpaddd(xmm10, xmm10, xmm6);
620 vpmaddubsw(xmm6, xmm4, xmm2);
621 vpmaddwd(xmm6, xmm7, xmm6);
622 vpaddd(xmm12, xmm12, xmm6);
623 vpmaddubsw(xmm6, xmm4, xmm3);
624 vpmaddwd(xmm6, xmm7, xmm6);
625 vpaddd(xmm14, xmm14, xmm6);
626 add(AO, 0x20);
627 add(BO, 0x2);
628 align(4);
629
630 L(labels[69]);
631 mov(H, K);
632 test(H, 0x1);
633 je(labels[71], T_NEAR);
634 vxorps(xmm6, xmm6, xmm6);
635 vbroadcastss(xmm0, dword[AO - 0x80]);
636 vpunpcklbw(xmm0, xmm0, xmm6);
637 vpunpcklwd(xmm0, xmm0, xmm6);
638 vbroadcastss(xmm1, dword[AO - 0x7c]);
639 vpunpcklbw(xmm1, xmm1, xmm6);
640 vpunpcklwd(xmm1, xmm1, xmm6);
641 vbroadcastss(xmm2, dword[AO - 0x78]);
642 vpunpcklbw(xmm2, xmm2, xmm6);
643 vpunpcklwd(xmm2, xmm2, xmm6);
644 vbroadcastss(xmm3, dword[AO - 0x74]);
645 vpunpcklbw(xmm3, xmm3, xmm6);
646 vpunpcklwd(xmm3, xmm3, xmm6);
647 vbroadcastss(xmm5, dword[BO - 0x80]);
648 vpunpcklbw(xmm5, xmm5, xmm5);
649 vpunpcklwd(xmm5, xmm5, xmm5);
650 vpshufd(xmm4, xmm5, 0x0);
651 vpmaddubsw(xmm6, xmm4, xmm0);
652 vpmaddwd(xmm6, xmm7, xmm6);
653 vpaddd(xmm8, xmm8, xmm6);
654 vpmaddubsw(xmm6, xmm4, xmm1);
655 vpmaddwd(xmm6, xmm7, xmm6);
656 vpaddd(xmm10, xmm10, xmm6);
657 vpmaddubsw(xmm6, xmm4, xmm2);
658 vpmaddwd(xmm6, xmm7, xmm6);
659 vpaddd(xmm12, xmm12, xmm6);
660 vpmaddubsw(xmm6, xmm4, xmm3);
661 vpmaddwd(xmm6, xmm7, xmm6);
662 vpaddd(xmm14, xmm14, xmm6);
663 add(AO, 0x10);
664 add(BO, 0x1);
665 align(4);
666
667 L(labels[71]);
668 vmovdqu(xmm0, xword[CO1]);
669 vpaddd(xmm8, xmm8, xmm0);
670 vmovdqu(xword[CO1], xmm8);
671 vxorps(xmm8, xmm8, xmm8);
672 vmovdqu(xmm0, xword[CO1 + 0x10]);
673 vpaddd(xmm10, xmm10, xmm0);
674 vmovdqu(xword[CO1 + 0x10], xmm10);
675 vxorps(xmm10, xmm10, xmm10);
676 vmovdqu(xmm0, xword[CO1 + 0x20]);
677 vpaddd(xmm12, xmm12, xmm0);
678 vmovdqu(xword[CO1 + 0x20], xmm12);
679 vxorps(xmm12, xmm12, xmm12);
680 vmovdqu(xmm0, xword[CO1 + 0x30]);
681 vpaddd(xmm14, xmm14, xmm0);
682 vmovdqu(xword[CO1 + 0x30], xmm14);
683 vxorps(xmm14, xmm14, xmm14);
684 lea(CO1, ptr[CO1 + LDC * 1]);
685 align(4);
686
687 L(labels[72]);
688 mov(A, AO);
689 sub(J, 0x10);
690 cmp(J, 0x10);
691 jge(labels[70], T_NEAR);
692 align(4);
693
694 L(labels[73]);
695 test(J, 0x8);
696 jle(labels[2], T_NEAR);
697 mov(CO1, C);
698 add(C, 0x20);
699 mov(BO, B);
700 mov(AA, K);
701 shl(AA, 0x10);
702 lea(AA, ptr[A + AA * 1 + 0x200]);
703 mov(I, N);
704 cmp(I, 0x2);
705 jl(labels[83], T_NEAR);
706 align(4);
707
708 L(labels[74]);
709 mov(AO, A);
710 vmovdqu(xmm0, xword[AO - 0x80]);
711 vmovdqu(xmm1, xword[AO - 0x70]);
712 vmovdqu(xmm5, xword[BO - 0x80]);
713 mov(H, K);
714 sar(H, 0x3);
715 jle(labels[79], T_NEAR);
716 sub(H, 0x8);
717 jle(labels[76], T_NEAR);
718 align(4);
719
720 L(labels[75]);
721 vpshufd(xmm4, xmm5, 0x0);
722 vpmaddubsw(xmm6, xmm4, xmm0);
723 vpmaddwd(xmm6, xmm7, xmm6);
724 vpaddd(xmm8, xmm8, xmm6);
725 prefetcht0(byte[AO + 0x180]);
726 vpmaddubsw(xmm6, xmm4, xmm1);
727 vpmaddwd(xmm6, xmm7, xmm6);
728 vpaddd(xmm10, xmm10, xmm6);
729 prefetcht0(byte[BO]);
730 vpshufd(xmm4, xmm5, 0x55);
731 vpmaddubsw(xmm6, xmm4, xmm0);
732 vpmaddwd(xmm6, xmm7, xmm6);
733 vpaddd(xmm9, xmm9, xmm6);
734 vpmaddubsw(xmm6, xmm4, xmm1);
735 vpmaddwd(xmm6, xmm7, xmm6);
736 vpaddd(xmm11, xmm11, xmm6);
737 vmovdqu(xmm0, xword[AO - 0x60]);
738 vmovdqu(xmm1, xword[AO - 0x50]);
739 prefetcht0(byte[AO + 0x1c0]);
740 vpshufd(xmm4, xmm5, 0xaa);
741 vpmaddubsw(xmm6, xmm4, xmm0);
742 vpmaddwd(xmm6, xmm7, xmm6);
743 vpaddd(xmm8, xmm8, xmm6);
744 vpmaddubsw(xmm6, xmm4, xmm1);
745 vpmaddwd(xmm6, xmm7, xmm6);
746 vpaddd(xmm10, xmm10, xmm6);
747 vpshufd(xmm4, xmm5, 0xff);
748 vpmaddubsw(xmm6, xmm4, xmm0);
749 vpmaddwd(xmm6, xmm7, xmm6);
750 vpaddd(xmm9, xmm9, xmm6);
751 vpmaddubsw(xmm6, xmm4, xmm1);
752 vpmaddwd(xmm6, xmm7, xmm6);
753 vpaddd(xmm11, xmm11, xmm6);
754 vmovdqu(xmm5, xword[BO - 0x70]);
755 prefetcht1(byte[AA - 0x80]);
756 vmovdqu(xmm0, xword[AO - 0x40]);
757 vmovdqu(xmm1, xword[AO - 0x30]);
758 add(AA, 0x4);
759 add(AO, 0x40);
760 add(BO, 0x10);
761 sub(H, 0x1);
762 jg(labels[75], T_NEAR);
763 align(4);
764
765 L(labels[76]);
766 prefetcht0(byte[CO1 + 0x3c]);
767 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
768 add(H, 0x8);
769 jle(labels[79], T_NEAR);
770 align(4);
771
772 L(labels[77]);
773 vpshufd(xmm4, xmm5, 0x0);
774 vpmaddubsw(xmm6, xmm4, xmm0);
775 vpmaddwd(xmm6, xmm7, xmm6);
776 vpaddd(xmm8, xmm8, xmm6);
777 prefetcht0(byte[AO + 0x180]);
778 vpmaddubsw(xmm6, xmm4, xmm1);
779 vpmaddwd(xmm6, xmm7, xmm6);
780 vpaddd(xmm10, xmm10, xmm6);
781 prefetcht0(byte[BO]);
782 vpshufd(xmm4, xmm5, 0x55);
783 vpmaddubsw(xmm6, xmm4, xmm0);
784 vpmaddwd(xmm6, xmm7, xmm6);
785 vpaddd(xmm9, xmm9, xmm6);
786 vpmaddubsw(xmm6, xmm4, xmm1);
787 vpmaddwd(xmm6, xmm7, xmm6);
788 vpaddd(xmm11, xmm11, xmm6);
789 vmovdqu(xmm0, xword[AO - 0x60]);
790 vmovdqu(xmm1, xword[AO - 0x50]);
791 prefetcht0(byte[AO + 0x1c0]);
792 vpshufd(xmm4, xmm5, 0xaa);
793 vpmaddubsw(xmm6, xmm4, xmm0);
794 vpmaddwd(xmm6, xmm7, xmm6);
795 vpaddd(xmm8, xmm8, xmm6);
796 vpmaddubsw(xmm6, xmm4, xmm1);
797 vpmaddwd(xmm6, xmm7, xmm6);
798 vpaddd(xmm10, xmm10, xmm6);
799 vpshufd(xmm4, xmm5, 0xff);
800 vpmaddubsw(xmm6, xmm4, xmm0);
801 vpmaddwd(xmm6, xmm7, xmm6);
802 vpaddd(xmm9, xmm9, xmm6);
803 vpmaddubsw(xmm6, xmm4, xmm1);
804 vpmaddwd(xmm6, xmm7, xmm6);
805 vpaddd(xmm11, xmm11, xmm6);
806 vmovdqu(xmm5, xword[BO - 0x70]);
807 prefetcht1(byte[AA - 0x80]);
808 vmovdqu(xmm0, xword[AO - 0x40]);
809 vmovdqu(xmm1, xword[AO - 0x30]);
810 add(AA, 0x4);
811 add(AO, 0x40);
812 add(BO, 0x10);
813 sub(H, 0x1);
814 jg(labels[77], T_NEAR);
815 align(4);
816
817 L(labels[79]);
818 mov(H, K);
819 test(H, 0x4);
820 je(labels[80], T_NEAR);
821 vpshufd(xmm4, xmm5, 0x0);
822 vpmaddubsw(xmm6, xmm4, xmm0);
823 vpmaddwd(xmm6, xmm7, xmm6);
824 vpaddd(xmm8, xmm8, xmm6);
825 vpmaddubsw(xmm6, xmm4, xmm1);
826 vpmaddwd(xmm6, xmm7, xmm6);
827 vpaddd(xmm10, xmm10, xmm6);
828 vpshufd(xmm4, xmm5, 0x55);
829 vpmaddubsw(xmm6, xmm4, xmm0);
830 vpmaddwd(xmm6, xmm7, xmm6);
831 vpaddd(xmm9, xmm9, xmm6);
832 vpmaddubsw(xmm6, xmm4, xmm1);
833 vpmaddwd(xmm6, xmm7, xmm6);
834 vpaddd(xmm11, xmm11, xmm6);
835 add(AO, 0x20);
836 add(BO, 0x8);
837 align(4);
838
839 L(labels[80]);
840 mov(H, K);
841 test(H, 0x2);
842 je(labels[81], T_NEAR);
843 vxorps(xmm6, xmm6, xmm6);
844 vmovdqu(xmm1, xword[AO - 0x80]);
845 vpunpcklwd(xmm0, xmm1, xmm6);
846 vpunpckhwd(xmm1, xmm1, xmm6);
847 vbroadcastss(xmm5, dword[BO - 0x80]);
848 vpunpcklwd(xmm5, xmm5, xmm5);
849 vpshufd(xmm4, xmm5, 0x0);
850 vpmaddubsw(xmm6, xmm4, xmm0);
851 vpmaddwd(xmm6, xmm7, xmm6);
852 vpaddd(xmm8, xmm8, xmm6);
853 vpmaddubsw(xmm6, xmm4, xmm1);
854 vpmaddwd(xmm6, xmm7, xmm6);
855 vpaddd(xmm10, xmm10, xmm6);
856 vpshufd(xmm4, xmm5, 0x55);
857 vpmaddubsw(xmm6, xmm4, xmm0);
858 vpmaddwd(xmm6, xmm7, xmm6);
859 vpaddd(xmm9, xmm9, xmm6);
860 vpmaddubsw(xmm6, xmm4, xmm1);
861 vpmaddwd(xmm6, xmm7, xmm6);
862 vpaddd(xmm11, xmm11, xmm6);
863 add(AO, 0x10);
864 add(BO, 0x4);
865 align(4);
866
867 L(labels[81]);
868 mov(H, K);
869 test(H, 0x1);
870 je(labels[82], T_NEAR);
871 vxorps(xmm6, xmm6, xmm6);
872 vbroadcastss(xmm0, dword[AO - 0x80]);
873 vpunpcklbw(xmm0, xmm0, xmm6);
874 vpunpcklwd(xmm0, xmm0, xmm6);
875 vbroadcastss(xmm1, dword[AO - 0x7c]);
876 vpunpcklbw(xmm1, xmm1, xmm6);
877 vpunpcklwd(xmm1, xmm1, xmm6);
878 vbroadcastss(xmm5, dword[BO - 0x80]);
879 vpunpcklbw(xmm5, xmm5, xmm5);
880 vpunpcklwd(xmm5, xmm5, xmm5);
881 vpshufd(xmm4, xmm5, 0x0);
882 vpmaddubsw(xmm6, xmm4, xmm0);
883 vpmaddwd(xmm6, xmm7, xmm6);
884 vpaddd(xmm8, xmm8, xmm6);
885 vpmaddubsw(xmm6, xmm4, xmm1);
886 vpmaddwd(xmm6, xmm7, xmm6);
887 vpaddd(xmm10, xmm10, xmm6);
888 vpshufd(xmm4, xmm5, 0x55);
889 vpmaddubsw(xmm6, xmm4, xmm0);
890 vpmaddwd(xmm6, xmm7, xmm6);
891 vpaddd(xmm9, xmm9, xmm6);
892 vpmaddubsw(xmm6, xmm4, xmm1);
893 vpmaddwd(xmm6, xmm7, xmm6);
894 vpaddd(xmm11, xmm11, xmm6);
895 add(AO, 0x8);
896 add(BO, 0x2);
897 align(4);
898
899 L(labels[82]);
900 vmovdqu(xmm0, xword[CO1]);
901 vpaddd(xmm8, xmm8, xmm0);
902 vmovdqu(xword[CO1], xmm8);
903 vxorps(xmm8, xmm8, xmm8);
904 vmovdqu(xmm0, xword[CO1 + 0x10]);
905 vpaddd(xmm10, xmm10, xmm0);
906 vmovdqu(xword[CO1 + 0x10], xmm10);
907 vxorps(xmm10, xmm10, xmm10);
908 vmovdqu(xmm0, xword[CO1 + LDC * 1]);
909 vpaddd(xmm9, xmm9, xmm0);
910 vmovdqu(xword[CO1 + LDC * 1], xmm9);
911 vxorps(xmm9, xmm9, xmm9);
912 vmovdqu(xmm0, xword[CO1 + LDC * 1 + 0x10]);
913 vpaddd(xmm11, xmm11, xmm0);
914 vmovdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
915 vxorps(xmm11, xmm11, xmm11);
916 lea(CO1, ptr[CO1 + LDC * 2]);
917 sub(I, 0x2);
918 cmp(I, 0x2);
919 jge(labels[74], T_NEAR);
920 align(4);
921
922 L(labels[83]);
923 test(I, 0x1);
924 jle(labels[1], T_NEAR);
925 mov(AO, A);
926 vmovdqu(xmm0, xword[AO - 0x80]);
927 vmovdqu(xmm1, xword[AO - 0x70]);
928 vmovdqu(xmm5, xword[BO - 0x80]);
929 mov(H, K);
930 sar(H, 0x3);
931 jle(labels[87], T_NEAR);
932 sub(H, 0x8);
933 jle(labels[85], T_NEAR);
934 align(4);
935
936 L(labels[84]);
937 vpshufd(xmm4, xmm5, 0x0);
938 vpmaddubsw(xmm6, xmm4, xmm0);
939 vpmaddwd(xmm6, xmm7, xmm6);
940 vpaddd(xmm8, xmm8, xmm6);
941 prefetcht0(byte[AO + 0x180]);
942 vpmaddubsw(xmm6, xmm4, xmm1);
943 vpmaddwd(xmm6, xmm7, xmm6);
944 vpaddd(xmm10, xmm10, xmm6);
945 prefetcht0(byte[BO]);
946 vmovdqu(xmm0, xword[AO - 0x60]);
947 vmovdqu(xmm1, xword[AO - 0x50]);
948 prefetcht0(byte[AO + 0x1c0]);
949 vpshufd(xmm4, xmm5, 0x55);
950 vpmaddubsw(xmm6, xmm4, xmm0);
951 vpmaddwd(xmm6, xmm7, xmm6);
952 vpaddd(xmm8, xmm8, xmm6);
953 vpmaddubsw(xmm6, xmm4, xmm1);
954 vpmaddwd(xmm6, xmm7, xmm6);
955 vpaddd(xmm10, xmm10, xmm6);
956 vmovdqu(xmm5, xword[BO - 0x78]);
957 prefetcht1(byte[AA - 0x80]);
958 vmovdqu(xmm0, xword[AO - 0x40]);
959 vmovdqu(xmm1, xword[AO - 0x30]);
960 add(AA, 0x4);
961 add(AO, 0x40);
962 add(BO, 0x8);
963 sub(H, 0x1);
964 jg(labels[84], T_NEAR);
965 align(4);
966
967 L(labels[85]);
968 prefetcht0(byte[CO1 + 0x3c]);
969 add(H, 0x8);
970 jle(labels[87], T_NEAR);
971 align(4);
972
973 L(labels[86]);
974 vpshufd(xmm4, xmm5, 0x0);
975 vpmaddubsw(xmm6, xmm4, xmm0);
976 vpmaddwd(xmm6, xmm7, xmm6);
977 vpaddd(xmm8, xmm8, xmm6);
978 prefetcht0(byte[AO + 0x180]);
979 vpmaddubsw(xmm6, xmm4, xmm1);
980 vpmaddwd(xmm6, xmm7, xmm6);
981 vpaddd(xmm10, xmm10, xmm6);
982 prefetcht0(byte[BO]);
983 vmovdqu(xmm0, xword[AO - 0x60]);
984 vmovdqu(xmm1, xword[AO - 0x50]);
985 prefetcht0(byte[AO + 0x1c0]);
986 vpshufd(xmm4, xmm5, 0x55);
987 vpmaddubsw(xmm6, xmm4, xmm0);
988 vpmaddwd(xmm6, xmm7, xmm6);
989 vpaddd(xmm8, xmm8, xmm6);
990 vpmaddubsw(xmm6, xmm4, xmm1);
991 vpmaddwd(xmm6, xmm7, xmm6);
992 vpaddd(xmm10, xmm10, xmm6);
993 vmovdqu(xmm5, xword[BO - 0x78]);
994 prefetcht1(byte[AA - 0x80]);
995 vmovdqu(xmm0, xword[AO - 0x40]);
996 vmovdqu(xmm1, xword[AO - 0x30]);
997 add(AA, 0x4);
998 add(AO, 0x40);
999 add(BO, 0x8);
1000 sub(H, 0x1);
1001 jg(labels[86], T_NEAR);
1002 align(4);
1003
1004 L(labels[87]);
1005 mov(H, K);
1006 test(H, 0x4);
1007 je(labels[89], T_NEAR);
1008 vpshufd(xmm4, xmm5, 0x0);
1009 vpmaddubsw(xmm6, xmm4, xmm0);
1010 vpmaddwd(xmm6, xmm7, xmm6);
1011 vpaddd(xmm8, xmm8, xmm6);
1012 vpmaddubsw(xmm6, xmm4, xmm1);
1013 vpmaddwd(xmm6, xmm7, xmm6);
1014 vpaddd(xmm10, xmm10, xmm6);
1015 add(AO, 0x20);
1016 add(BO, 0x4);
1017 align(4);
1018
1019 L(labels[89]);
1020 mov(H, K);
1021 test(H, 0x2);
1022 je(labels[90], T_NEAR);
1023 vxorps(xmm6, xmm6, xmm6);
1024 vmovdqu(xmm1, xword[AO - 0x80]);
1025 vpunpcklwd(xmm0, xmm1, xmm6);
1026 vpunpckhwd(xmm1, xmm1, xmm6);
1027 vbroadcastss(xmm5, dword[BO - 0x80]);
1028 vpunpcklwd(xmm5, xmm5, xmm5);
1029 vpshufd(xmm4, xmm5, 0x0);
1030 vpmaddubsw(xmm6, xmm4, xmm0);
1031 vpmaddwd(xmm6, xmm7, xmm6);
1032 vpaddd(xmm8, xmm8, xmm6);
1033 vpmaddubsw(xmm6, xmm4, xmm1);
1034 vpmaddwd(xmm6, xmm7, xmm6);
1035 vpaddd(xmm10, xmm10, xmm6);
1036 add(AO, 0x10);
1037 add(BO, 0x2);
1038 align(4);
1039
1040 L(labels[90]);
1041 mov(H, K);
1042 test(H, 0x1);
1043 je(labels[0], T_NEAR);
1044 vxorps(xmm6, xmm6, xmm6);
1045 vbroadcastss(xmm0, dword[AO - 0x80]);
1046 vpunpcklbw(xmm0, xmm0, xmm6);
1047 vpunpcklwd(xmm0, xmm0, xmm6);
1048 vbroadcastss(xmm1, dword[AO - 0x7c]);
1049 vpunpcklbw(xmm1, xmm1, xmm6);
1050 vpunpcklwd(xmm1, xmm1, xmm6);
1051 vbroadcastss(xmm5, dword[BO - 0x80]);
1052 vpunpcklbw(xmm5, xmm5, xmm5);
1053 vpunpcklwd(xmm5, xmm5, xmm5);
1054 vpshufd(xmm4, xmm5, 0x0);
1055 vpmaddubsw(xmm6, xmm4, xmm0);
1056 vpmaddwd(xmm6, xmm7, xmm6);
1057 vpaddd(xmm8, xmm8, xmm6);
1058 vpmaddubsw(xmm6, xmm4, xmm1);
1059 vpmaddwd(xmm6, xmm7, xmm6);
1060 vpaddd(xmm10, xmm10, xmm6);
1061 add(AO, 0x8);
1062 add(BO, 0x1);
1063 align(4);
1064
1065 L(labels[0]);
1066 vmovdqu(xmm0, xword[CO1]);
1067 vpaddd(xmm8, xmm8, xmm0);
1068 vmovdqu(xword[CO1], xmm8);
1069 vxorps(xmm8, xmm8, xmm8);
1070 vmovdqu(xmm0, xword[CO1 + 0x10]);
1071 vpaddd(xmm10, xmm10, xmm0);
1072 vmovdqu(xword[CO1 + 0x10], xmm10);
1073 vxorps(xmm10, xmm10, xmm10);
1074 lea(CO1, ptr[CO1 + LDC * 1]);
1075 align(4);
1076
1077 L(labels[1]);
1078 mov(A, AO);
1079 align(4);
1080
1081 L(labels[2]);
1082 test(J, 0x4);
1083 jle(labels[20], T_NEAR);
1084 mov(CO1, C);
1085 add(C, 0x10);
1086 mov(BO, B);
1087 mov(AA, K);
1088 shl(AA, 0x8);
1089 lea(AA, ptr[A + AA * 1 + 0x200]);
1090 mov(I, N);
1091 cmp(I, 0x2);
1092 jl(labels[11], T_NEAR);
1093 align(4);
1094
1095 L(labels[3]);
1096 mov(AO, A);
1097 vmovdqu(xmm0, xword[AO - 0x80]);
1098 vmovdqu(xmm5, xword[BO - 0x80]);
1099 mov(H, K);
1100 sar(H, 0x3);
1101 jle(labels[7], T_NEAR);
1102 sub(H, 0x8);
1103 jle(labels[5], T_NEAR);
1104 align(4);
1105
1106 L(labels[4]);
1107 vpshufd(xmm4, xmm5, 0x0);
1108 vpmaddubsw(xmm6, xmm4, xmm0);
1109 vpmaddwd(xmm6, xmm7, xmm6);
1110 vpaddd(xmm8, xmm8, xmm6);
1111 prefetcht0(byte[AO + 0x180]);
1112 prefetcht0(byte[BO]);
1113 vpshufd(xmm4, xmm5, 0x55);
1114 vpmaddubsw(xmm6, xmm4, xmm0);
1115 vpmaddwd(xmm6, xmm7, xmm6);
1116 vpaddd(xmm9, xmm9, xmm6);
1117 vmovdqu(xmm0, xword[AO - 0x70]);
1118 prefetcht0(byte[AO + 0x1c0]);
1119 vpshufd(xmm4, xmm5, 0xaa);
1120 vpmaddubsw(xmm6, xmm4, xmm0);
1121 vpmaddwd(xmm6, xmm7, xmm6);
1122 vpaddd(xmm8, xmm8, xmm6);
1123 vpshufd(xmm4, xmm5, 0xff);
1124 vpmaddubsw(xmm6, xmm4, xmm0);
1125 vpmaddwd(xmm6, xmm7, xmm6);
1126 vpaddd(xmm9, xmm9, xmm6);
1127 vmovdqu(xmm5, xword[BO - 0x70]);
1128 prefetcht1(byte[AA - 0x80]);
1129 vmovdqu(xmm0, xword[AO - 0x60]);
1130 add(AA, 0x4);
1131 add(AO, 0x20);
1132 add(BO, 0x10);
1133 sub(H, 0x1);
1134 jg(labels[4], T_NEAR);
1135 align(4);
1136
1137 L(labels[5]);
1138 prefetcht0(byte[CO1 + 0x3c]);
1139 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
1140 add(H, 0x8);
1141 jle(labels[7], T_NEAR);
1142 align(4);
1143
1144 L(labels[6]);
1145 vpshufd(xmm4, xmm5, 0x0);
1146 vpmaddubsw(xmm6, xmm4, xmm0);
1147 vpmaddwd(xmm6, xmm7, xmm6);
1148 vpaddd(xmm8, xmm8, xmm6);
1149 prefetcht0(byte[AO + 0x180]);
1150 prefetcht0(byte[BO]);
1151 vpshufd(xmm4, xmm5, 0x55);
1152 vpmaddubsw(xmm6, xmm4, xmm0);
1153 vpmaddwd(xmm6, xmm7, xmm6);
1154 vpaddd(xmm9, xmm9, xmm6);
1155 vmovdqu(xmm0, xword[AO - 0x70]);
1156 prefetcht0(byte[AO + 0x1c0]);
1157 vpshufd(xmm4, xmm5, 0xaa);
1158 vpmaddubsw(xmm6, xmm4, xmm0);
1159 vpmaddwd(xmm6, xmm7, xmm6);
1160 vpaddd(xmm8, xmm8, xmm6);
1161 vpshufd(xmm4, xmm5, 0xff);
1162 vpmaddubsw(xmm6, xmm4, xmm0);
1163 vpmaddwd(xmm6, xmm7, xmm6);
1164 vpaddd(xmm9, xmm9, xmm6);
1165 vmovdqu(xmm5, xword[BO - 0x70]);
1166 prefetcht1(byte[AA - 0x80]);
1167 vmovdqu(xmm0, xword[AO - 0x60]);
1168 add(AA, 0x4);
1169 add(AO, 0x20);
1170 add(BO, 0x10);
1171 sub(H, 0x1);
1172 jg(labels[6], T_NEAR);
1173 align(4);
1174
1175 L(labels[7]);
1176 mov(H, K);
1177 test(H, 0x4);
1178 je(labels[8], T_NEAR);
1179 vpshufd(xmm4, xmm5, 0x0);
1180 vpmaddubsw(xmm6, xmm4, xmm0);
1181 vpmaddwd(xmm6, xmm7, xmm6);
1182 vpaddd(xmm8, xmm8, xmm6);
1183 vpshufd(xmm4, xmm5, 0x55);
1184 vpmaddubsw(xmm6, xmm4, xmm0);
1185 vpmaddwd(xmm6, xmm7, xmm6);
1186 vpaddd(xmm9, xmm9, xmm6);
1187 add(AO, 0x10);
1188 add(BO, 0x8);
1189 align(4);
1190
1191 L(labels[8]);
1192 mov(H, K);
1193 test(H, 0x2);
1194 je(labels[9], T_NEAR);
1195 vxorps(xmm6, xmm6, xmm6);
1196 vmovdqu(xmm1, xword[AO - 0x80]);
1197 vpunpcklwd(xmm0, xmm1, xmm6);
1198 vbroadcastss(xmm5, dword[BO - 0x80]);
1199 vpunpcklwd(xmm5, xmm5, xmm5);
1200 vpshufd(xmm4, xmm5, 0x0);
1201 vpmaddubsw(xmm6, xmm4, xmm0);
1202 vpmaddwd(xmm6, xmm7, xmm6);
1203 vpaddd(xmm8, xmm8, xmm6);
1204 vpshufd(xmm4, xmm5, 0x55);
1205 vpmaddubsw(xmm6, xmm4, xmm0);
1206 vpmaddwd(xmm6, xmm7, xmm6);
1207 vpaddd(xmm9, xmm9, xmm6);
1208 add(AO, 0x8);
1209 add(BO, 0x4);
1210 align(4);
1211
1212 L(labels[9]);
1213 mov(H, K);
1214 test(H, 0x1);
1215 je(labels[10], T_NEAR);
1216 vxorps(xmm6, xmm6, xmm6);
1217 vbroadcastss(xmm0, dword[AO - 0x80]);
1218 vpunpcklbw(xmm0, xmm0, xmm6);
1219 vpunpcklwd(xmm0, xmm0, xmm6);
1220 vbroadcastss(xmm5, dword[BO - 0x80]);
1221 vpunpcklbw(xmm5, xmm5, xmm5);
1222 vpunpcklwd(xmm5, xmm5, xmm5);
1223 vpshufd(xmm4, xmm5, 0x0);
1224 vpmaddubsw(xmm6, xmm4, xmm0);
1225 vpmaddwd(xmm6, xmm7, xmm6);
1226 vpaddd(xmm8, xmm8, xmm6);
1227 vpshufd(xmm4, xmm5, 0x55);
1228 vpmaddubsw(xmm6, xmm4, xmm0);
1229 vpmaddwd(xmm6, xmm7, xmm6);
1230 vpaddd(xmm9, xmm9, xmm6);
1231 add(AO, 0x4);
1232 add(BO, 0x2);
1233 align(4);
1234
1235 L(labels[10]);
1236 vmovdqu(xmm0, xword[CO1]);
1237 vpaddd(xmm8, xmm8, xmm0);
1238 vmovdqu(xword[CO1], xmm8);
1239 vxorps(xmm8, xmm8, xmm8);
1240 vmovdqu(xmm0, xword[CO1 + LDC * 1]);
1241 vpaddd(xmm9, xmm9, xmm0);
1242 vmovdqu(xword[CO1 + LDC * 1], xmm9);
1243 vxorps(xmm9, xmm9, xmm9);
1244 lea(CO1, ptr[CO1 + LDC * 2]);
1245 sub(I, 0x2);
1246 cmp(I, 0x2);
1247 jge(labels[3], T_NEAR);
1248 align(4);
1249
1250 L(labels[11]);
1251 test(I, 0x1);
1252 jle(labels[19], T_NEAR);
1253 mov(AO, A);
1254 vmovdqu(xmm0, xword[AO - 0x80]);
1255 vmovdqu(xmm5, xword[BO - 0x80]);
1256 mov(H, K);
1257 sar(H, 0x3);
1258 jle(labels[15], T_NEAR);
1259 sub(H, 0x8);
1260 jle(labels[13], T_NEAR);
1261 align(4);
1262
1263 L(labels[12]);
1264 vpshufd(xmm4, xmm5, 0x0);
1265 vpmaddubsw(xmm6, xmm4, xmm0);
1266 vpmaddwd(xmm6, xmm7, xmm6);
1267 vpaddd(xmm8, xmm8, xmm6);
1268 prefetcht0(byte[AO + 0x180]);
1269 prefetcht0(byte[BO]);
1270 vmovdqu(xmm0, xword[AO - 0x70]);
1271 prefetcht0(byte[AO + 0x1c0]);
1272 vpshufd(xmm4, xmm5, 0x55);
1273 vpmaddubsw(xmm6, xmm4, xmm0);
1274 vpmaddwd(xmm6, xmm7, xmm6);
1275 vpaddd(xmm8, xmm8, xmm6);
1276 vmovdqu(xmm5, xword[BO - 0x78]);
1277 prefetcht1(byte[AA - 0x80]);
1278 vmovdqu(xmm0, xword[AO - 0x60]);
1279 add(AA, 0x4);
1280 add(AO, 0x20);
1281 add(BO, 0x8);
1282 sub(H, 0x1);
1283 jg(labels[12], T_NEAR);
1284 align(4);
1285
1286 L(labels[13]);
1287 prefetcht0(byte[CO1 + 0x3c]);
1288 add(H, 0x8);
1289 jle(labels[15], T_NEAR);
1290 align(4);
1291
1292 L(labels[14]);
1293 vpshufd(xmm4, xmm5, 0x0);
1294 vpmaddubsw(xmm6, xmm4, xmm0);
1295 vpmaddwd(xmm6, xmm7, xmm6);
1296 vpaddd(xmm8, xmm8, xmm6);
1297 prefetcht0(byte[AO + 0x180]);
1298 prefetcht0(byte[BO]);
1299 vmovdqu(xmm0, xword[AO - 0x70]);
1300 prefetcht0(byte[AO + 0x1c0]);
1301 vpshufd(xmm4, xmm5, 0x55);
1302 vpmaddubsw(xmm6, xmm4, xmm0);
1303 vpmaddwd(xmm6, xmm7, xmm6);
1304 vpaddd(xmm8, xmm8, xmm6);
1305 vmovdqu(xmm5, xword[BO - 0x78]);
1306 prefetcht1(byte[AA - 0x80]);
1307 vmovdqu(xmm0, xword[AO - 0x60]);
1308 add(AA, 0x4);
1309 add(AO, 0x20);
1310 add(BO, 0x8);
1311 sub(H, 0x1);
1312 jg(labels[14], T_NEAR);
1313 align(4);
1314
1315 L(labels[15]);
1316 mov(H, K);
1317 test(H, 0x4);
1318 je(labels[16], T_NEAR);
1319 vpshufd(xmm4, xmm5, 0x0);
1320 vpmaddubsw(xmm6, xmm4, xmm0);
1321 vpmaddwd(xmm6, xmm7, xmm6);
1322 vpaddd(xmm8, xmm8, xmm6);
1323 add(AO, 0x10);
1324 add(BO, 0x4);
1325 align(4);
1326
1327 L(labels[16]);
1328 mov(H, K);
1329 test(H, 0x2);
1330 je(labels[17], T_NEAR);
1331 vxorps(xmm6, xmm6, xmm6);
1332 vmovdqu(xmm1, xword[AO - 0x80]);
1333 vpunpcklwd(xmm0, xmm1, xmm6);
1334 vbroadcastss(xmm5, dword[BO - 0x80]);
1335 vpunpcklwd(xmm5, xmm5, xmm5);
1336 vpshufd(xmm4, xmm5, 0x0);
1337 vpmaddubsw(xmm6, xmm4, xmm0);
1338 vpmaddwd(xmm6, xmm7, xmm6);
1339 vpaddd(xmm8, xmm8, xmm6);
1340 add(AO, 0x8);
1341 add(BO, 0x2);
1342 align(4);
1343
1344 L(labels[17]);
1345 mov(H, K);
1346 test(H, 0x1);
1347 je(labels[18], T_NEAR);
1348 vxorps(xmm6, xmm6, xmm6);
1349 vbroadcastss(xmm0, dword[AO - 0x80]);
1350 vpunpcklbw(xmm0, xmm0, xmm6);
1351 vpunpcklwd(xmm0, xmm0, xmm6);
1352 vbroadcastss(xmm5, dword[BO - 0x80]);
1353 vpunpcklbw(xmm5, xmm5, xmm5);
1354 vpunpcklwd(xmm5, xmm5, xmm5);
1355 vpshufd(xmm4, xmm5, 0x0);
1356 vpmaddubsw(xmm6, xmm4, xmm0);
1357 vpmaddwd(xmm6, xmm7, xmm6);
1358 vpaddd(xmm8, xmm8, xmm6);
1359 add(AO, 0x4);
1360 add(BO, 0x1);
1361 align(4);
1362
1363 L(labels[18]);
1364 vmovdqu(xmm0, xword[CO1]);
1365 vpaddd(xmm8, xmm8, xmm0);
1366 vmovdqu(xword[CO1], xmm8);
1367 vxorps(xmm8, xmm8, xmm8);
1368 lea(CO1, ptr[CO1 + LDC * 1]);
1369 align(4);
1370
1371 L(labels[19]);
1372 mov(A, AO);
1373 align(4);
1374
1375 L(labels[20]);
1376 test(J, 0x2);
1377 jle(labels[38], T_NEAR);
1378 mov(CO1, C);
1379 add(C, 0x8);
1380 mov(BO, B);
1381 mov(AA, K);
1382 shl(AA, 0x4);
1383 lea(AA, ptr[A + AA * 1 + 0x200]);
1384 mov(I, N);
1385 cmp(I, 0x2);
1386 jl(labels[29], T_NEAR);
1387 align(4);
1388
1389 L(labels[21]);
1390 mov(AO, A);
1391 vmovdqu(xmm0, xword[AO - 0x80]);
1392 vmovdqu(xmm5, xword[BO - 0x80]);
1393 mov(H, K);
1394 sar(H, 0x3);
1395 jle(labels[25], T_NEAR);
1396 sub(H, 0x8);
1397 jle(labels[23], T_NEAR);
1398 align(4);
1399
1400 L(labels[22]);
1401 vpshufd(xmm4, xmm5, 0x0);
1402 vpmaddubsw(xmm6, xmm4, xmm0);
1403 vpmaddwd(xmm6, xmm7, xmm6);
1404 vpaddd(xmm8, xmm8, xmm6);
1405 prefetcht0(byte[AO + 0x180]);
1406 prefetcht0(byte[BO]);
1407 vpshufd(xmm4, xmm5, 0x55);
1408 vpmaddubsw(xmm6, xmm4, xmm0);
1409 vpmaddwd(xmm6, xmm7, xmm6);
1410 vpaddd(xmm9, xmm9, xmm6);
1411 vmovdqu(xmm0, xword[AO - 0x78]);
1412 prefetcht0(byte[AO + 0x1c0]);
1413 vpshufd(xmm4, xmm5, 0xaa);
1414 vpmaddubsw(xmm6, xmm4, xmm0);
1415 vpmaddwd(xmm6, xmm7, xmm6);
1416 vpaddd(xmm8, xmm8, xmm6);
1417 vpshufd(xmm4, xmm5, 0xff);
1418 vpmaddubsw(xmm6, xmm4, xmm0);
1419 vpmaddwd(xmm6, xmm7, xmm6);
1420 vpaddd(xmm9, xmm9, xmm6);
1421 vmovdqu(xmm5, xword[BO - 0x70]);
1422 prefetcht1(byte[AA - 0x80]);
1423 vmovdqu(xmm0, xword[AO - 0x70]);
1424 add(AA, 0x4);
1425 add(AO, 0x10);
1426 add(BO, 0x10);
1427 sub(H, 0x1);
1428 jg(labels[22], T_NEAR);
1429 align(4);
1430
1431 L(labels[23]);
1432 prefetcht0(byte[CO1 + 0x3c]);
1433 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
1434 add(H, 0x8);
1435 jle(labels[25], T_NEAR);
1436 align(4);
1437
1438 L(labels[24]);
1439 vpshufd(xmm4, xmm5, 0x0);
1440 vpmaddubsw(xmm6, xmm4, xmm0);
1441 vpmaddwd(xmm6, xmm7, xmm6);
1442 vpaddd(xmm8, xmm8, xmm6);
1443 prefetcht0(byte[AO + 0x180]);
1444 prefetcht0(byte[BO]);
1445 vpshufd(xmm4, xmm5, 0x55);
1446 vpmaddubsw(xmm6, xmm4, xmm0);
1447 vpmaddwd(xmm6, xmm7, xmm6);
1448 vpaddd(xmm9, xmm9, xmm6);
1449 vmovdqu(xmm0, xword[AO - 0x78]);
1450 prefetcht0(byte[AO + 0x1c0]);
1451 vpshufd(xmm4, xmm5, 0xaa);
1452 vpmaddubsw(xmm6, xmm4, xmm0);
1453 vpmaddwd(xmm6, xmm7, xmm6);
1454 vpaddd(xmm8, xmm8, xmm6);
1455 vpshufd(xmm4, xmm5, 0xff);
1456 vpmaddubsw(xmm6, xmm4, xmm0);
1457 vpmaddwd(xmm6, xmm7, xmm6);
1458 vpaddd(xmm9, xmm9, xmm6);
1459 vmovdqu(xmm5, xword[BO - 0x70]);
1460 prefetcht1(byte[AA - 0x80]);
1461 vmovdqu(xmm0, xword[AO - 0x70]);
1462 add(AA, 0x4);
1463 add(AO, 0x10);
1464 add(BO, 0x10);
1465 sub(H, 0x1);
1466 jg(labels[24], T_NEAR);
1467 align(4);
1468
1469 L(labels[25]);
1470 mov(H, K);
1471 test(H, 0x4);
1472 je(labels[26], T_NEAR);
1473 vpshufd(xmm4, xmm5, 0x0);
1474 vpmaddubsw(xmm6, xmm4, xmm0);
1475 vpmaddwd(xmm6, xmm7, xmm6);
1476 vpaddd(xmm8, xmm8, xmm6);
1477 vpshufd(xmm4, xmm5, 0x55);
1478 vpmaddubsw(xmm6, xmm4, xmm0);
1479 vpmaddwd(xmm6, xmm7, xmm6);
1480 vpaddd(xmm9, xmm9, xmm6);
1481 add(AO, 0x8);
1482 add(BO, 0x8);
1483 align(4);
1484
1485 L(labels[26]);
1486 mov(H, K);
1487 test(H, 0x2);
1488 je(labels[27], T_NEAR);
1489 vxorps(xmm6, xmm6, xmm6);
1490 vmovdqu(xmm1, xword[AO - 0x80]);
1491 vpunpcklwd(xmm0, xmm1, xmm6);
1492 vbroadcastss(xmm5, dword[BO - 0x80]);
1493 vpunpcklwd(xmm5, xmm5, xmm5);
1494 vpshufd(xmm4, xmm5, 0x0);
1495 vpmaddubsw(xmm6, xmm4, xmm0);
1496 vpmaddwd(xmm6, xmm7, xmm6);
1497 vpaddd(xmm8, xmm8, xmm6);
1498 vpshufd(xmm4, xmm5, 0x55);
1499 vpmaddubsw(xmm6, xmm4, xmm0);
1500 vpmaddwd(xmm6, xmm7, xmm6);
1501 vpaddd(xmm9, xmm9, xmm6);
1502 add(AO, 0x4);
1503 add(BO, 0x4);
1504 align(4);
1505
1506 L(labels[27]);
1507 mov(H, K);
1508 test(H, 0x1);
1509 je(labels[28], T_NEAR);
1510 vxorps(xmm6, xmm6, xmm6);
1511 vbroadcastss(xmm0, dword[AO - 0x80]);
1512 vpunpcklbw(xmm0, xmm0, xmm6);
1513 vpunpcklwd(xmm0, xmm0, xmm6);
1514 vbroadcastss(xmm5, dword[BO - 0x80]);
1515 vpunpcklbw(xmm5, xmm5, xmm5);
1516 vpunpcklwd(xmm5, xmm5, xmm5);
1517 vpshufd(xmm4, xmm5, 0x0);
1518 vpmaddubsw(xmm6, xmm4, xmm0);
1519 vpmaddwd(xmm6, xmm7, xmm6);
1520 vpaddd(xmm8, xmm8, xmm6);
1521 vpshufd(xmm4, xmm5, 0x55);
1522 vpmaddubsw(xmm6, xmm4, xmm0);
1523 vpmaddwd(xmm6, xmm7, xmm6);
1524 vpaddd(xmm9, xmm9, xmm6);
1525 add(AO, 0x2);
1526 add(BO, 0x2);
1527 align(4);
1528
1529 L(labels[28]);
1530 vmovsd(xmm0, qword[CO1]);
1531 vpaddd(xmm8, xmm8, xmm0);
1532 vmovlps(qword[CO1], xmm8);
1533 vxorps(xmm8, xmm8, xmm8);
1534 vmovsd(xmm0, qword[CO1 + LDC * 1]);
1535 vpaddd(xmm9, xmm9, xmm0);
1536 vmovlps(qword[CO1 + LDC * 1], xmm9);
1537 vxorps(xmm9, xmm9, xmm9);
1538 lea(CO1, ptr[CO1 + LDC * 2]);
1539 sub(I, 0x2);
1540 cmp(I, 0x2);
1541 jge(labels[21], T_NEAR);
1542 align(4);
1543
1544 L(labels[29]);
1545 test(I, 0x1);
1546 jle(labels[37], T_NEAR);
1547 mov(AO, A);
1548 vmovdqu(xmm0, xword[AO - 0x80]);
1549 vmovdqu(xmm5, xword[BO - 0x80]);
1550 mov(H, K);
1551 sar(H, 0x3);
1552 jle(labels[33], T_NEAR);
1553 sub(H, 0x8);
1554 jle(labels[31], T_NEAR);
1555 align(4);
1556
1557 L(labels[30]);
1558 vpshufd(xmm4, xmm5, 0x0);
1559 vpmaddubsw(xmm6, xmm4, xmm0);
1560 vpmaddwd(xmm6, xmm7, xmm6);
1561 vpaddd(xmm8, xmm8, xmm6);
1562 prefetcht0(byte[AO + 0x180]);
1563 prefetcht0(byte[BO]);
1564 vmovdqu(xmm0, xword[AO - 0x78]);
1565 prefetcht0(byte[AO + 0x1c0]);
1566 vpshufd(xmm4, xmm5, 0x55);
1567 vpmaddubsw(xmm6, xmm4, xmm0);
1568 vpmaddwd(xmm6, xmm7, xmm6);
1569 vpaddd(xmm8, xmm8, xmm6);
1570 vmovdqu(xmm5, xword[BO - 0x78]);
1571 prefetcht1(byte[AA - 0x80]);
1572 vmovdqu(xmm0, xword[AO - 0x70]);
1573 add(AA, 0x4);
1574 add(AO, 0x10);
1575 add(BO, 0x8);
1576 sub(H, 0x1);
1577 jg(labels[30], T_NEAR);
1578 align(4);
1579
1580 L(labels[31]);
1581 prefetcht0(byte[CO1 + 0x3c]);
1582 add(H, 0x8);
1583 jle(labels[33], T_NEAR);
1584 align(4);
1585
1586 L(labels[32]);
1587 vpshufd(xmm4, xmm5, 0x0);
1588 vpmaddubsw(xmm6, xmm4, xmm0);
1589 vpmaddwd(xmm6, xmm7, xmm6);
1590 vpaddd(xmm8, xmm8, xmm6);
1591 prefetcht0(byte[AO + 0x180]);
1592 prefetcht0(byte[BO]);
1593 vmovdqu(xmm0, xword[AO - 0x78]);
1594 prefetcht0(byte[AO + 0x1c0]);
1595 vpshufd(xmm4, xmm5, 0x55);
1596 vpmaddubsw(xmm6, xmm4, xmm0);
1597 vpmaddwd(xmm6, xmm7, xmm6);
1598 vpaddd(xmm8, xmm8, xmm6);
1599 vmovdqu(xmm5, xword[BO - 0x78]);
1600 prefetcht1(byte[AA - 0x80]);
1601 vmovdqu(xmm0, xword[AO - 0x70]);
1602 add(AA, 0x4);
1603 add(AO, 0x10);
1604 add(BO, 0x8);
1605 sub(H, 0x1);
1606 jg(labels[32], T_NEAR);
1607 align(4);
1608
1609 L(labels[33]);
1610 mov(H, K);
1611 test(H, 0x4);
1612 je(labels[34], T_NEAR);
1613 vpshufd(xmm4, xmm5, 0x0);
1614 vpmaddubsw(xmm6, xmm4, xmm0);
1615 vpmaddwd(xmm6, xmm7, xmm6);
1616 vpaddd(xmm8, xmm8, xmm6);
1617 add(AO, 0x8);
1618 add(BO, 0x4);
1619 align(4);
1620
1621 L(labels[34]);
1622 mov(H, K);
1623 test(H, 0x2);
1624 je(labels[35], T_NEAR);
1625 vxorps(xmm6, xmm6, xmm6);
1626 vmovdqu(xmm1, xword[AO - 0x80]);
1627 vpunpcklwd(xmm0, xmm1, xmm6);
1628 vbroadcastss(xmm5, dword[BO - 0x80]);
1629 vpunpcklwd(xmm5, xmm5, xmm5);
1630 vpshufd(xmm4, xmm5, 0x0);
1631 vpmaddubsw(xmm6, xmm4, xmm0);
1632 vpmaddwd(xmm6, xmm7, xmm6);
1633 vpaddd(xmm8, xmm8, xmm6);
1634 add(AO, 0x4);
1635 add(BO, 0x2);
1636 align(4);
1637
1638 L(labels[35]);
1639 mov(H, K);
1640 test(H, 0x1);
1641 je(labels[36], T_NEAR);
1642 vxorps(xmm6, xmm6, xmm6);
1643 vbroadcastss(xmm0, dword[AO - 0x80]);
1644 vpunpcklbw(xmm0, xmm0, xmm6);
1645 vpunpcklwd(xmm0, xmm0, xmm6);
1646 vbroadcastss(xmm5, dword[BO - 0x80]);
1647 vpunpcklbw(xmm5, xmm5, xmm5);
1648 vpunpcklwd(xmm5, xmm5, xmm5);
1649 vpshufd(xmm4, xmm5, 0x0);
1650 vpmaddubsw(xmm6, xmm4, xmm0);
1651 vpmaddwd(xmm6, xmm7, xmm6);
1652 vpaddd(xmm8, xmm8, xmm6);
1653 add(AO, 0x2);
1654 add(BO, 0x1);
1655 align(4);
1656
1657 L(labels[36]);
1658 vmovsd(xmm0, qword[CO1]);
1659 vpaddd(xmm8, xmm8, xmm0);
1660 vmovlps(qword[CO1], xmm8);
1661 vxorps(xmm8, xmm8, xmm8);
1662 lea(CO1, ptr[CO1 + LDC * 1]);
1663 align(4);
1664
1665 L(labels[37]);
1666 mov(A, AO);
1667 align(4);
1668
1669 L(labels[38]);
1670 test(J, 0x1);
1671 jle(labels[56], T_NEAR);
1672 mov(CO1, C);
1673 add(C, 0x4);
1674 mov(BO, B);
1675 mov(AA, K);
1676 shl(AA, 0x2);
1677 lea(AA, ptr[A + AA * 1 + 0x200]);
1678 mov(I, N);
1679 cmp(I, 0x2);
1680 jl(labels[47], T_NEAR);
1681 align(4);
1682
1683 L(labels[39]);
1684 mov(AO, A);
1685 vmovdqu(xmm0, xword[AO - 0x80]);
1686 vmovdqu(xmm5, xword[BO - 0x80]);
1687 mov(H, K);
1688 sar(H, 0x3);
1689 jle(labels[43], T_NEAR);
1690 sub(H, 0x8);
1691 jle(labels[41], T_NEAR);
1692 align(4);
1693
1694 L(labels[40]);
1695 vpshufd(xmm4, xmm5, 0x0);
1696 vpmaddubsw(xmm6, xmm4, xmm0);
1697 vpmaddwd(xmm6, xmm7, xmm6);
1698 vpaddd(xmm8, xmm8, xmm6);
1699 prefetcht0(byte[AO + 0x180]);
1700 prefetcht0(byte[BO]);
1701 vpshufd(xmm4, xmm5, 0x55);
1702 vpmaddubsw(xmm6, xmm4, xmm0);
1703 vpmaddwd(xmm6, xmm7, xmm6);
1704 vpaddd(xmm9, xmm9, xmm6);
1705 vmovdqu(xmm0, xword[AO - 0x7c]);
1706 prefetcht0(byte[AO + 0x1c0]);
1707 vpshufd(xmm4, xmm5, 0xaa);
1708 vpmaddubsw(xmm6, xmm4, xmm0);
1709 vpmaddwd(xmm6, xmm7, xmm6);
1710 vpaddd(xmm8, xmm8, xmm6);
1711 vpshufd(xmm4, xmm5, 0xff);
1712 vpmaddubsw(xmm6, xmm4, xmm0);
1713 vpmaddwd(xmm6, xmm7, xmm6);
1714 vpaddd(xmm9, xmm9, xmm6);
1715 vmovdqu(xmm5, xword[BO - 0x70]);
1716 prefetcht1(byte[AA - 0x80]);
1717 vmovdqu(xmm0, xword[AO - 0x78]);
1718 add(AA, 0x4);
1719 add(AO, 0x8);
1720 add(BO, 0x10);
1721 sub(H, 0x1);
1722 jg(labels[40], T_NEAR);
1723 align(4);
1724
1725 L(labels[41]);
1726 prefetcht0(byte[CO1 + 0x3c]);
1727 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
1728 add(H, 0x8);
1729 jle(labels[43], T_NEAR);
1730 align(4);
1731
1732 L(labels[42]);
1733 vpshufd(xmm4, xmm5, 0x0);
1734 vpmaddubsw(xmm6, xmm4, xmm0);
1735 vpmaddwd(xmm6, xmm7, xmm6);
1736 vpaddd(xmm8, xmm8, xmm6);
1737 prefetcht0(byte[AO + 0x180]);
1738 prefetcht0(byte[BO]);
1739 vpshufd(xmm4, xmm5, 0x55);
1740 vpmaddubsw(xmm6, xmm4, xmm0);
1741 vpmaddwd(xmm6, xmm7, xmm6);
1742 vpaddd(xmm9, xmm9, xmm6);
1743 vmovdqu(xmm0, xword[AO - 0x7c]);
1744 prefetcht0(byte[AO + 0x1c0]);
1745 vpshufd(xmm4, xmm5, 0xaa);
1746 vpmaddubsw(xmm6, xmm4, xmm0);
1747 vpmaddwd(xmm6, xmm7, xmm6);
1748 vpaddd(xmm8, xmm8, xmm6);
1749 vpshufd(xmm4, xmm5, 0xff);
1750 vpmaddubsw(xmm6, xmm4, xmm0);
1751 vpmaddwd(xmm6, xmm7, xmm6);
1752 vpaddd(xmm9, xmm9, xmm6);
1753 vmovdqu(xmm5, xword[BO - 0x70]);
1754 prefetcht1(byte[AA - 0x80]);
1755 vmovdqu(xmm0, xword[AO - 0x78]);
1756 add(AA, 0x4);
1757 add(AO, 0x8);
1758 add(BO, 0x10);
1759 sub(H, 0x1);
1760 jg(labels[42], T_NEAR);
1761 align(4);
1762
1763 L(labels[43]);
1764 mov(H, K);
1765 test(H, 0x4);
1766 je(labels[44], T_NEAR);
1767 vpshufd(xmm4, xmm5, 0x0);
1768 vpmaddubsw(xmm6, xmm4, xmm0);
1769 vpmaddwd(xmm6, xmm7, xmm6);
1770 vpaddd(xmm8, xmm8, xmm6);
1771 vpshufd(xmm4, xmm5, 0x55);
1772 vpmaddubsw(xmm6, xmm4, xmm0);
1773 vpmaddwd(xmm6, xmm7, xmm6);
1774 vpaddd(xmm9, xmm9, xmm6);
1775 add(AO, 0x4);
1776 add(BO, 0x8);
1777 align(4);
1778
1779 L(labels[44]);
1780 mov(H, K);
1781 test(H, 0x2);
1782 je(labels[45], T_NEAR);
1783 vxorps(xmm6, xmm6, xmm6);
1784 vmovdqu(xmm1, xword[AO - 0x80]);
1785 vpunpcklwd(xmm0, xmm1, xmm6);
1786 vbroadcastss(xmm5, dword[BO - 0x80]);
1787 vpunpcklwd(xmm5, xmm5, xmm5);
1788 vpshufd(xmm4, xmm5, 0x0);
1789 vpmaddubsw(xmm6, xmm4, xmm0);
1790 vpmaddwd(xmm6, xmm7, xmm6);
1791 vpaddd(xmm8, xmm8, xmm6);
1792 vpshufd(xmm4, xmm5, 0x55);
1793 vpmaddubsw(xmm6, xmm4, xmm0);
1794 vpmaddwd(xmm6, xmm7, xmm6);
1795 vpaddd(xmm9, xmm9, xmm6);
1796 add(AO, 0x2);
1797 add(BO, 0x4);
1798 align(4);
1799
1800 L(labels[45]);
1801 mov(H, K);
1802 test(H, 0x1);
1803 je(labels[46], T_NEAR);
1804 vxorps(xmm6, xmm6, xmm6);
1805 vbroadcastss(xmm0, dword[AO - 0x80]);
1806 vpunpcklbw(xmm0, xmm0, xmm6);
1807 vpunpcklwd(xmm0, xmm0, xmm6);
1808 vbroadcastss(xmm5, dword[BO - 0x80]);
1809 vpunpcklbw(xmm5, xmm5, xmm5);
1810 vpunpcklwd(xmm5, xmm5, xmm5);
1811 vpshufd(xmm4, xmm5, 0x0);
1812 vpmaddubsw(xmm6, xmm4, xmm0);
1813 vpmaddwd(xmm6, xmm7, xmm6);
1814 vpaddd(xmm8, xmm8, xmm6);
1815 vpshufd(xmm4, xmm5, 0x55);
1816 vpmaddubsw(xmm6, xmm4, xmm0);
1817 vpmaddwd(xmm6, xmm7, xmm6);
1818 vpaddd(xmm9, xmm9, xmm6);
1819 add(AO, 0x1);
1820 add(BO, 0x2);
1821 align(4);
1822
1823 L(labels[46]);
1824 vmovss(xmm0, dword[CO1]);
1825 vpaddd(xmm8, xmm8, xmm0);
1826 vmovss(dword[CO1], xmm8);
1827 vxorps(xmm8, xmm8, xmm8);
1828 vmovss(xmm0, dword[CO1 + LDC * 1]);
1829 vpaddd(xmm9, xmm9, xmm0);
1830 vmovss(dword[CO1 + LDC * 1], xmm9);
1831 vxorps(xmm9, xmm9, xmm9);
1832 lea(CO1, ptr[CO1 + LDC * 2]);
1833 sub(I, 0x2);
1834 cmp(I, 0x2);
1835 jge(labels[39], T_NEAR);
1836 align(4);
1837
1838 L(labels[47]);
1839 test(I, 0x1);
1840 jle(labels[55], T_NEAR);
1841 mov(AO, A);
1842 vmovdqu(xmm0, xword[AO - 0x80]);
1843 vmovdqu(xmm5, xword[BO - 0x80]);
1844 mov(H, K);
1845 sar(H, 0x3);
1846 jle(labels[51], T_NEAR);
1847 sub(H, 0x8);
1848 jle(labels[49], T_NEAR);
1849 align(4);
1850
1851 L(labels[48]);
1852 vpshufd(xmm4, xmm5, 0x0);
1853 vpmaddubsw(xmm6, xmm4, xmm0);
1854 vpmaddwd(xmm6, xmm7, xmm6);
1855 vpaddd(xmm8, xmm8, xmm6);
1856 prefetcht0(byte[AO + 0x180]);
1857 prefetcht0(byte[BO]);
1858 vmovdqu(xmm0, xword[AO - 0x7c]);
1859 prefetcht0(byte[AO + 0x1c0]);
1860 vpshufd(xmm4, xmm5, 0x55);
1861 vpmaddubsw(xmm6, xmm4, xmm0);
1862 vpmaddwd(xmm6, xmm7, xmm6);
1863 vpaddd(xmm8, xmm8, xmm6);
1864 vmovdqu(xmm5, xword[BO - 0x78]);
1865 prefetcht1(byte[AA - 0x80]);
1866 vmovdqu(xmm0, xword[AO - 0x78]);
1867 add(AA, 0x4);
1868 add(AO, 0x8);
1869 add(BO, 0x8);
1870 sub(H, 0x1);
1871 jg(labels[48], T_NEAR);
1872 align(4);
1873
1874 L(labels[49]);
1875 prefetcht0(byte[CO1 + 0x3c]);
1876 add(H, 0x8);
1877 jle(labels[51], T_NEAR);
1878 align(4);
1879
1880 L(labels[50]);
1881 vpshufd(xmm4, xmm5, 0x0);
1882 vpmaddubsw(xmm6, xmm4, xmm0);
1883 vpmaddwd(xmm6, xmm7, xmm6);
1884 vpaddd(xmm8, xmm8, xmm6);
1885 prefetcht0(byte[AO + 0x180]);
1886 prefetcht0(byte[BO]);
1887 vmovdqu(xmm0, xword[AO - 0x7c]);
1888 prefetcht0(byte[AO + 0x1c0]);
1889 vpshufd(xmm4, xmm5, 0x55);
1890 vpmaddubsw(xmm6, xmm4, xmm0);
1891 vpmaddwd(xmm6, xmm7, xmm6);
1892 vpaddd(xmm8, xmm8, xmm6);
1893 vmovdqu(xmm5, xword[BO - 0x78]);
1894 prefetcht1(byte[AA - 0x80]);
1895 vmovdqu(xmm0, xword[AO - 0x78]);
1896 add(AA, 0x4);
1897 add(AO, 0x8);
1898 add(BO, 0x8);
1899 sub(H, 0x1);
1900 jg(labels[50], T_NEAR);
1901 align(4);
1902
1903 L(labels[51]);
1904 mov(H, K);
1905 test(H, 0x4);
1906 je(labels[52], T_NEAR);
1907 vpshufd(xmm4, xmm5, 0x0);
1908 vpmaddubsw(xmm6, xmm4, xmm0);
1909 vpmaddwd(xmm6, xmm7, xmm6);
1910 vpaddd(xmm8, xmm8, xmm6);
1911 add(AO, 0x4);
1912 add(BO, 0x4);
1913 align(4);
1914
1915 L(labels[52]);
1916 mov(H, K);
1917 test(H, 0x2);
1918 je(labels[53], T_NEAR);
1919 vxorps(xmm6, xmm6, xmm6);
1920 vmovdqu(xmm1, xword[AO - 0x80]);
1921 vpunpcklwd(xmm0, xmm1, xmm6);
1922 vbroadcastss(xmm5, dword[BO - 0x80]);
1923 vpunpcklwd(xmm5, xmm5, xmm5);
1924 vpshufd(xmm4, xmm5, 0x0);
1925 vpmaddubsw(xmm6, xmm4, xmm0);
1926 vpmaddwd(xmm6, xmm7, xmm6);
1927 vpaddd(xmm8, xmm8, xmm6);
1928 add(AO, 0x2);
1929 add(BO, 0x2);
1930 align(4);
1931
1932 L(labels[53]);
1933 mov(H, K);
1934 test(H, 0x1);
1935 je(labels[54], T_NEAR);
1936 vxorps(xmm6, xmm6, xmm6);
1937 vbroadcastss(xmm0, dword[AO - 0x80]);
1938 vpunpcklbw(xmm0, xmm0, xmm6);
1939 vpunpcklwd(xmm0, xmm0, xmm6);
1940 vbroadcastss(xmm5, dword[BO - 0x80]);
1941 vpunpcklbw(xmm5, xmm5, xmm5);
1942 vpunpcklwd(xmm5, xmm5, xmm5);
1943 vpshufd(xmm4, xmm5, 0x0);
1944 vpmaddubsw(xmm6, xmm4, xmm0);
1945 vpmaddwd(xmm6, xmm7, xmm6);
1946 vpaddd(xmm8, xmm8, xmm6);
1947 add(AO, 0x1);
1948 add(BO, 0x1);
1949 align(4);
1950
1951 L(labels[54]);
1952 vmovss(xmm0, dword[CO1]);
1953 vpaddd(xmm8, xmm8, xmm0);
1954 vmovss(dword[CO1], xmm8);
1955 vxorps(xmm8, xmm8, xmm8);
1956 lea(CO1, ptr[CO1 + LDC * 1]);
1957 align(4);
1958
1959 L(labels[55]);
1960 mov(A, AO);
1961 align(4);
1962
1963 L(labels[56]);
1964 add(rsp, stack_alloc_size);
1965 postamble();
1966 }
1967 outLocalLabel();
1968
1969 #undef M
1970 #undef N
1971 #undef K
1972 #undef A
1973 #undef B
1974 #undef C
1975 #undef LDC
1976 #undef AA
1977 #undef I
1978 #undef J
1979 #undef H
1980 #undef AO
1981 #undef BO
1982 #undef CO1
1983 #undef CO2
1984 #ifdef _WIN32
1985 #undef ARG_A
1986 #undef ARG_B
1987 #endif
1988 #undef ARG_C
1989 #undef ARG_LDC
1990 }
1991
1992 } // namespace x64
1993 } // namespace cpu
1994 } // namespace impl
1995 } // namespace dnnl
1996