1 /*******************************************************************************
2 * Copyright 2020-2021 Intel Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16
17 #include "cpu/x64/jit_generator.hpp"
18
19 #include "cpu/x64/gemm/s8x8s32/common_u8.hpp"
20
21 namespace dnnl {
22 namespace impl {
23 namespace cpu {
24 namespace x64 {
25
jit_sse41_kernel_b0_gemm_s8u8s32_kern()26 jit_sse41_kernel_b0_gemm_s8u8s32_kern::jit_sse41_kernel_b0_gemm_s8u8s32_kern()
27 : jit_generator(nullptr, S8U8S32_COMPUTE_KERNEL_CODE_SIZE) {}
28
generate()29 void jit_sse41_kernel_b0_gemm_s8u8s32_kern::generate() {
30
31 #ifndef _WIN32
32
33 #define M rdi
34 #define N rsi
35 #define K rdx
36 #define A r8
37 #define B r9
38 #define C r10
39 #define LDC r11
40
41 #define AA rcx
42 #define I r12
43 #define J r13
44 #define H rax
45 #define AO r14
46 #define BO r15
47 #define CO1 rbx
48 #define CO2 rbp
49
50 #else
51
52 #define M rcx
53 #define N rdx
54 #define K r8
55 #define A rsi
56 #define B r9
57 #define C r10
58 #define LDC r11
59
60 #define AA rdi
61 #define I r12
62 #define J r13
63 #define H rax
64 #define AO r14
65 #define BO r15
66 #define CO1 rbx
67 #define CO2 rbp
68
69 #endif
70
71 #ifdef _WIN32
72 #define ARG_A (args_offset - 16) + rsp
73 #define ARG_B (args_offset - 8) + rsp
74 #endif
75 #define ARG_C ((args_offset + 0) + rsp)
76 #define ARG_LDC ((args_offset + 8) + rsp)
77
78 inLocalLabel();
79 {
80 std::vector<Xbyak::Label> labels(91);
81
82 auto stack_alloc_size = 32;
83 auto args_offset = stack_alloc_size + get_size_of_abi_save_regs() + 8;
84 #ifdef _WIN32
85 args_offset += 48;
86 #endif
87 preamble();
88 sub(rsp, stack_alloc_size);
89 #ifdef _WIN32
90 mov(A, ptr[ARG_A]);
91 mov(B, ptr[ARG_B]);
92 #endif
93
94 mov(C, qword[ARG_C]);
95 mov(LDC, qword[ARG_LDC]);
96 sub(A, -128);
97 sub(B, -128);
98 mov(M, qword[M]);
99 mov(N, qword[N]);
100 mov(K, qword[K]);
101 lea(LDC, ptr[LDC * 4 + 0x0]);
102 xorps(xmm8, xmm8);
103 xorps(xmm9, xmm9);
104 xorps(xmm10, xmm10);
105 xorps(xmm11, xmm11);
106 xorps(xmm12, xmm12);
107 xorps(xmm13, xmm13);
108 xorps(xmm14, xmm14);
109 xorps(xmm15, xmm15);
110 mov(H, 0x10001);
111 movq(xmm7, H);
112 pshufd(xmm7, xmm7, 0x0);
113 mov(J, M);
114 cmp(J, 0x10);
115 jl(labels[75], T_NEAR);
116 align(4);
117
118 L(labels[69]);
119 mov(CO1, C);
120 add(C, 0x40);
121 mov(BO, B);
122 mov(AA, K);
123 shl(AA, 0x20);
124 lea(AA, ptr[A + AA * 1 + 0x200]);
125 mov(I, N);
126 cmp(I, 0x2);
127 jl(labels[65], T_NEAR);
128 align(4);
129
130 L(labels[78]);
131 mov(AO, A);
132 movdqu(xmm0, xword[AO - 0x80]);
133 movdqu(xmm1, xword[AO - 0x70]);
134 movdqu(xmm2, xword[AO - 0x60]);
135 movdqu(xmm3, xword[AO - 0x50]);
136 movdqu(xmm5, xword[BO - 0x80]);
137 mov(H, K);
138 sar(H, 0x3);
139 jle(labels[61], T_NEAR);
140 sub(H, 0x8);
141 jle(labels[59], T_NEAR);
142 align(4);
143
144 L(labels[86]);
145 pshufd(xmm4, xmm5, 0x0);
146 movaps(xmm6, xmm4);
147 pmaddubsw(xmm6, xmm0);
148 pmaddwd(xmm6, xmm7);
149 paddd(xmm8, xmm6);
150 prefetcht0(byte[AO + 0x180]);
151 movaps(xmm6, xmm4);
152 pmaddubsw(xmm6, xmm1);
153 pmaddwd(xmm6, xmm7);
154 paddd(xmm10, xmm6);
155 movaps(xmm6, xmm4);
156 pmaddubsw(xmm6, xmm2);
157 pmaddwd(xmm6, xmm7);
158 paddd(xmm12, xmm6);
159 pmaddubsw(xmm4, xmm3);
160 pmaddwd(xmm4, xmm7);
161 paddd(xmm14, xmm4);
162 prefetcht0(byte[BO]);
163 pshufd(xmm4, xmm5, 0x55);
164 movaps(xmm6, xmm4);
165 pmaddubsw(xmm6, xmm0);
166 pmaddwd(xmm6, xmm7);
167 paddd(xmm9, xmm6);
168 movaps(xmm6, xmm4);
169 pmaddubsw(xmm6, xmm1);
170 pmaddwd(xmm6, xmm7);
171 paddd(xmm11, xmm6);
172 movaps(xmm6, xmm4);
173 pmaddubsw(xmm6, xmm2);
174 pmaddwd(xmm6, xmm7);
175 paddd(xmm13, xmm6);
176 pmaddubsw(xmm4, xmm3);
177 pmaddwd(xmm4, xmm7);
178 paddd(xmm15, xmm4);
179 movdqu(xmm0, xword[AO - 0x40]);
180 movdqu(xmm1, xword[AO - 0x30]);
181 movdqu(xmm2, xword[AO - 0x20]);
182 movdqu(xmm3, xword[AO - 0x10]);
183 prefetcht0(byte[AO + 0x1c0]);
184 pshufd(xmm4, xmm5, 0xaa);
185 movaps(xmm6, xmm4);
186 pmaddubsw(xmm6, xmm0);
187 pmaddwd(xmm6, xmm7);
188 paddd(xmm8, xmm6);
189 movaps(xmm6, xmm4);
190 pmaddubsw(xmm6, xmm1);
191 pmaddwd(xmm6, xmm7);
192 paddd(xmm10, xmm6);
193 movaps(xmm6, xmm4);
194 pmaddubsw(xmm6, xmm2);
195 pmaddwd(xmm6, xmm7);
196 paddd(xmm12, xmm6);
197 pmaddubsw(xmm4, xmm3);
198 pmaddwd(xmm4, xmm7);
199 paddd(xmm14, xmm4);
200 pshufd(xmm4, xmm5, 0xff);
201 movaps(xmm6, xmm4);
202 pmaddubsw(xmm6, xmm0);
203 pmaddwd(xmm6, xmm7);
204 paddd(xmm9, xmm6);
205 movaps(xmm6, xmm4);
206 pmaddubsw(xmm6, xmm1);
207 pmaddwd(xmm6, xmm7);
208 paddd(xmm11, xmm6);
209 movaps(xmm6, xmm4);
210 pmaddubsw(xmm6, xmm2);
211 pmaddwd(xmm6, xmm7);
212 paddd(xmm13, xmm6);
213 pmaddubsw(xmm4, xmm3);
214 pmaddwd(xmm4, xmm7);
215 paddd(xmm15, xmm4);
216 movdqu(xmm5, xword[BO - 0x70]);
217 prefetcht1(byte[AA - 0x80]);
218 movdqu(xmm0, xword[AO]);
219 movdqu(xmm1, xword[AO + 0x10]);
220 movdqu(xmm2, xword[AO + 0x20]);
221 movdqu(xmm3, xword[AO + 0x30]);
222 add(AA, 0x4);
223 add(AO, 0x80);
224 add(BO, 0x10);
225 sub(H, 0x1);
226 jg(labels[86], T_NEAR);
227 align(4);
228
229 L(labels[59]);
230 prefetcht0(byte[CO1 + 0x3c]);
231 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
232 add(H, 0x8);
233 jle(labels[61], T_NEAR);
234 align(4);
235
236 L(labels[60]);
237 pshufd(xmm4, xmm5, 0x0);
238 movaps(xmm6, xmm4);
239 pmaddubsw(xmm6, xmm0);
240 pmaddwd(xmm6, xmm7);
241 paddd(xmm8, xmm6);
242 prefetcht0(byte[AO + 0x180]);
243 movaps(xmm6, xmm4);
244 pmaddubsw(xmm6, xmm1);
245 pmaddwd(xmm6, xmm7);
246 paddd(xmm10, xmm6);
247 movaps(xmm6, xmm4);
248 pmaddubsw(xmm6, xmm2);
249 pmaddwd(xmm6, xmm7);
250 paddd(xmm12, xmm6);
251 pmaddubsw(xmm4, xmm3);
252 pmaddwd(xmm4, xmm7);
253 paddd(xmm14, xmm4);
254 prefetcht0(byte[BO]);
255 pshufd(xmm4, xmm5, 0x55);
256 movaps(xmm6, xmm4);
257 pmaddubsw(xmm6, xmm0);
258 pmaddwd(xmm6, xmm7);
259 paddd(xmm9, xmm6);
260 movaps(xmm6, xmm4);
261 pmaddubsw(xmm6, xmm1);
262 pmaddwd(xmm6, xmm7);
263 paddd(xmm11, xmm6);
264 movaps(xmm6, xmm4);
265 pmaddubsw(xmm6, xmm2);
266 pmaddwd(xmm6, xmm7);
267 paddd(xmm13, xmm6);
268 pmaddubsw(xmm4, xmm3);
269 pmaddwd(xmm4, xmm7);
270 paddd(xmm15, xmm4);
271 movdqu(xmm0, xword[AO - 0x40]);
272 movdqu(xmm1, xword[AO - 0x30]);
273 movdqu(xmm2, xword[AO - 0x20]);
274 movdqu(xmm3, xword[AO - 0x10]);
275 prefetcht0(byte[AO + 0x1c0]);
276 pshufd(xmm4, xmm5, 0xaa);
277 movaps(xmm6, xmm4);
278 pmaddubsw(xmm6, xmm0);
279 pmaddwd(xmm6, xmm7);
280 paddd(xmm8, xmm6);
281 movaps(xmm6, xmm4);
282 pmaddubsw(xmm6, xmm1);
283 pmaddwd(xmm6, xmm7);
284 paddd(xmm10, xmm6);
285 movaps(xmm6, xmm4);
286 pmaddubsw(xmm6, xmm2);
287 pmaddwd(xmm6, xmm7);
288 paddd(xmm12, xmm6);
289 pmaddubsw(xmm4, xmm3);
290 pmaddwd(xmm4, xmm7);
291 paddd(xmm14, xmm4);
292 pshufd(xmm4, xmm5, 0xff);
293 movaps(xmm6, xmm4);
294 pmaddubsw(xmm6, xmm0);
295 pmaddwd(xmm6, xmm7);
296 paddd(xmm9, xmm6);
297 movaps(xmm6, xmm4);
298 pmaddubsw(xmm6, xmm1);
299 pmaddwd(xmm6, xmm7);
300 paddd(xmm11, xmm6);
301 movaps(xmm6, xmm4);
302 pmaddubsw(xmm6, xmm2);
303 pmaddwd(xmm6, xmm7);
304 paddd(xmm13, xmm6);
305 pmaddubsw(xmm4, xmm3);
306 pmaddwd(xmm4, xmm7);
307 paddd(xmm15, xmm4);
308 movdqu(xmm5, xword[BO - 0x70]);
309 prefetcht1(byte[AA - 0x80]);
310 movdqu(xmm0, xword[AO]);
311 movdqu(xmm1, xword[AO + 0x10]);
312 movdqu(xmm2, xword[AO + 0x20]);
313 movdqu(xmm3, xword[AO + 0x30]);
314 add(AA, 0x4);
315 add(AO, 0x80);
316 add(BO, 0x10);
317 sub(H, 0x1);
318 jg(labels[60], T_NEAR);
319 align(4);
320
321 L(labels[61]);
322 mov(H, K);
323 test(H, 0x4);
324 je(labels[62], T_NEAR);
325 pshufd(xmm4, xmm5, 0x0);
326 movaps(xmm6, xmm4);
327 pmaddubsw(xmm6, xmm0);
328 pmaddwd(xmm6, xmm7);
329 paddd(xmm8, xmm6);
330 movaps(xmm6, xmm4);
331 pmaddubsw(xmm6, xmm1);
332 pmaddwd(xmm6, xmm7);
333 paddd(xmm10, xmm6);
334 movaps(xmm6, xmm4);
335 pmaddubsw(xmm6, xmm2);
336 pmaddwd(xmm6, xmm7);
337 paddd(xmm12, xmm6);
338 pmaddubsw(xmm4, xmm3);
339 pmaddwd(xmm4, xmm7);
340 paddd(xmm14, xmm4);
341 pshufd(xmm4, xmm5, 0x55);
342 movaps(xmm6, xmm4);
343 pmaddubsw(xmm6, xmm0);
344 pmaddwd(xmm6, xmm7);
345 paddd(xmm9, xmm6);
346 movaps(xmm6, xmm4);
347 pmaddubsw(xmm6, xmm1);
348 pmaddwd(xmm6, xmm7);
349 paddd(xmm11, xmm6);
350 movaps(xmm6, xmm4);
351 pmaddubsw(xmm6, xmm2);
352 pmaddwd(xmm6, xmm7);
353 paddd(xmm13, xmm6);
354 pmaddubsw(xmm4, xmm3);
355 pmaddwd(xmm4, xmm7);
356 paddd(xmm15, xmm4);
357 add(AO, 0x40);
358 add(BO, 0x8);
359 align(4);
360
361 L(labels[62]);
362 mov(H, K);
363 test(H, 0x2);
364 je(labels[63], T_NEAR);
365 xorps(xmm6, xmm6);
366 movdqu(xmm0, xword[AO - 0x80]);
367 movaps(xmm1, xmm0);
368 punpcklwd(xmm0, xmm6);
369 punpckhwd(xmm1, xmm6);
370 movdqu(xmm2, xword[AO - 0x70]);
371 movaps(xmm3, xmm2);
372 punpcklwd(xmm2, xmm6);
373 punpckhwd(xmm3, xmm6);
374 movss(xmm5, dword[BO - 0x80]);
375 punpcklwd(xmm5, xmm5);
376 pshufd(xmm4, xmm5, 0x0);
377 movaps(xmm6, xmm4);
378 pmaddubsw(xmm6, xmm0);
379 pmaddwd(xmm6, xmm7);
380 paddd(xmm8, xmm6);
381 movaps(xmm6, xmm4);
382 pmaddubsw(xmm6, xmm1);
383 pmaddwd(xmm6, xmm7);
384 paddd(xmm10, xmm6);
385 movaps(xmm6, xmm4);
386 pmaddubsw(xmm6, xmm2);
387 pmaddwd(xmm6, xmm7);
388 paddd(xmm12, xmm6);
389 pmaddubsw(xmm4, xmm3);
390 pmaddwd(xmm4, xmm7);
391 paddd(xmm14, xmm4);
392 pshufd(xmm4, xmm5, 0x55);
393 movaps(xmm6, xmm4);
394 pmaddubsw(xmm6, xmm0);
395 pmaddwd(xmm6, xmm7);
396 paddd(xmm9, xmm6);
397 movaps(xmm6, xmm4);
398 pmaddubsw(xmm6, xmm1);
399 pmaddwd(xmm6, xmm7);
400 paddd(xmm11, xmm6);
401 movaps(xmm6, xmm4);
402 pmaddubsw(xmm6, xmm2);
403 pmaddwd(xmm6, xmm7);
404 paddd(xmm13, xmm6);
405 pmaddubsw(xmm4, xmm3);
406 pmaddwd(xmm4, xmm7);
407 paddd(xmm15, xmm4);
408 add(AO, 0x20);
409 add(BO, 0x4);
410 align(4);
411
412 L(labels[63]);
413 mov(H, K);
414 test(H, 0x1);
415 je(labels[64], T_NEAR);
416 xorps(xmm6, xmm6);
417 movdqu(xmm3, xword[AO - 0x80]);
418 pshufd(xmm0, xmm3, 0x0);
419 punpcklbw(xmm0, xmm6);
420 punpcklwd(xmm0, xmm6);
421 pshufd(xmm1, xmm3, 0x55);
422 punpcklbw(xmm1, xmm6);
423 punpcklwd(xmm1, xmm6);
424 pshufd(xmm2, xmm3, 0xaa);
425 punpcklbw(xmm2, xmm6);
426 punpcklwd(xmm2, xmm6);
427 pshufd(xmm3, xmm3, 0xff);
428 punpcklbw(xmm3, xmm6);
429 punpcklwd(xmm3, xmm6);
430 movd(xmm5, dword[BO - 0x80]);
431 punpcklbw(xmm5, xmm5);
432 punpcklwd(xmm5, xmm5);
433 pshufd(xmm4, xmm5, 0x0);
434 movaps(xmm6, xmm4);
435 pmaddubsw(xmm6, xmm0);
436 pmaddwd(xmm6, xmm7);
437 paddd(xmm8, xmm6);
438 movaps(xmm6, xmm4);
439 pmaddubsw(xmm6, xmm1);
440 pmaddwd(xmm6, xmm7);
441 paddd(xmm10, xmm6);
442 movaps(xmm6, xmm4);
443 pmaddubsw(xmm6, xmm2);
444 pmaddwd(xmm6, xmm7);
445 paddd(xmm12, xmm6);
446 pmaddubsw(xmm4, xmm3);
447 pmaddwd(xmm4, xmm7);
448 paddd(xmm14, xmm4);
449 pshufd(xmm4, xmm5, 0x55);
450 movaps(xmm6, xmm4);
451 pmaddubsw(xmm6, xmm0);
452 pmaddwd(xmm6, xmm7);
453 paddd(xmm9, xmm6);
454 movaps(xmm6, xmm4);
455 pmaddubsw(xmm6, xmm1);
456 pmaddwd(xmm6, xmm7);
457 paddd(xmm11, xmm6);
458 movaps(xmm6, xmm4);
459 pmaddubsw(xmm6, xmm2);
460 pmaddwd(xmm6, xmm7);
461 paddd(xmm13, xmm6);
462 pmaddubsw(xmm4, xmm3);
463 pmaddwd(xmm4, xmm7);
464 paddd(xmm15, xmm4);
465 add(AO, 0x10);
466 add(BO, 0x2);
467 align(4);
468
469 L(labels[64]);
470 movdqu(xword[CO1], xmm8);
471 xorps(xmm8, xmm8);
472 movdqu(xword[CO1 + 0x10], xmm10);
473 xorps(xmm10, xmm10);
474 movdqu(xword[CO1 + 0x20], xmm12);
475 xorps(xmm12, xmm12);
476 movdqu(xword[CO1 + 0x30], xmm14);
477 xorps(xmm14, xmm14);
478 movdqu(xword[CO1 + LDC * 1], xmm9);
479 xorps(xmm9, xmm9);
480 movdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
481 xorps(xmm11, xmm11);
482 movdqu(xword[CO1 + LDC * 1 + 0x20], xmm13);
483 xorps(xmm13, xmm13);
484 movdqu(xword[CO1 + LDC * 1 + 0x30], xmm15);
485 xorps(xmm15, xmm15);
486 lea(CO1, ptr[CO1 + LDC * 2]);
487 sub(I, 0x2);
488 cmp(I, 0x2);
489 jge(labels[78], T_NEAR);
490 align(4);
491
492 L(labels[65]);
493 test(I, 0x1);
494 jle(labels[74], T_NEAR);
495 mov(AO, A);
496 movdqu(xmm0, xword[AO - 0x80]);
497 movdqu(xmm1, xword[AO - 0x70]);
498 movdqu(xmm2, xword[AO - 0x60]);
499 movdqu(xmm3, xword[AO - 0x50]);
500 movdqu(xmm5, xword[BO - 0x80]);
501 mov(H, K);
502 sar(H, 0x3);
503 jle(labels[70], T_NEAR);
504 sub(H, 0x8);
505 jle(labels[67], T_NEAR);
506 align(4);
507
508 L(labels[66]);
509 pshufd(xmm4, xmm5, 0x0);
510 movaps(xmm6, xmm4);
511 pmaddubsw(xmm6, xmm0);
512 pmaddwd(xmm6, xmm7);
513 paddd(xmm8, xmm6);
514 prefetcht0(byte[AO + 0x180]);
515 movaps(xmm6, xmm4);
516 pmaddubsw(xmm6, xmm1);
517 pmaddwd(xmm6, xmm7);
518 paddd(xmm10, xmm6);
519 movaps(xmm6, xmm4);
520 pmaddubsw(xmm6, xmm2);
521 pmaddwd(xmm6, xmm7);
522 paddd(xmm12, xmm6);
523 pmaddubsw(xmm4, xmm3);
524 pmaddwd(xmm4, xmm7);
525 paddd(xmm14, xmm4);
526 prefetcht0(byte[BO]);
527 movdqu(xmm0, xword[AO - 0x40]);
528 movdqu(xmm1, xword[AO - 0x30]);
529 movdqu(xmm2, xword[AO - 0x20]);
530 movdqu(xmm3, xword[AO - 0x10]);
531 prefetcht0(byte[AO + 0x1c0]);
532 pshufd(xmm4, xmm5, 0x55);
533 movaps(xmm6, xmm4);
534 pmaddubsw(xmm6, xmm0);
535 pmaddwd(xmm6, xmm7);
536 paddd(xmm8, xmm6);
537 movaps(xmm6, xmm4);
538 pmaddubsw(xmm6, xmm1);
539 pmaddwd(xmm6, xmm7);
540 paddd(xmm10, xmm6);
541 movaps(xmm6, xmm4);
542 pmaddubsw(xmm6, xmm2);
543 pmaddwd(xmm6, xmm7);
544 paddd(xmm12, xmm6);
545 pmaddubsw(xmm4, xmm3);
546 pmaddwd(xmm4, xmm7);
547 paddd(xmm14, xmm4);
548 movdqu(xmm5, xword[BO - 0x78]);
549 prefetcht1(byte[AA - 0x80]);
550 movdqu(xmm0, xword[AO]);
551 movdqu(xmm1, xword[AO + 0x10]);
552 movdqu(xmm2, xword[AO + 0x20]);
553 movdqu(xmm3, xword[AO + 0x30]);
554 add(AA, 0x4);
555 add(AO, 0x80);
556 add(BO, 0x8);
557 sub(H, 0x1);
558 jg(labels[66], T_NEAR);
559 align(4);
560
561 L(labels[67]);
562 prefetcht0(byte[CO1 + 0x3c]);
563 add(H, 0x8);
564 jle(labels[70], T_NEAR);
565 align(4);
566
567 L(labels[68]);
568 pshufd(xmm4, xmm5, 0x0);
569 movaps(xmm6, xmm4);
570 pmaddubsw(xmm6, xmm0);
571 pmaddwd(xmm6, xmm7);
572 paddd(xmm8, xmm6);
573 prefetcht0(byte[AO + 0x180]);
574 movaps(xmm6, xmm4);
575 pmaddubsw(xmm6, xmm1);
576 pmaddwd(xmm6, xmm7);
577 paddd(xmm10, xmm6);
578 movaps(xmm6, xmm4);
579 pmaddubsw(xmm6, xmm2);
580 pmaddwd(xmm6, xmm7);
581 paddd(xmm12, xmm6);
582 pmaddubsw(xmm4, xmm3);
583 pmaddwd(xmm4, xmm7);
584 paddd(xmm14, xmm4);
585 prefetcht0(byte[BO]);
586 movdqu(xmm0, xword[AO - 0x40]);
587 movdqu(xmm1, xword[AO - 0x30]);
588 movdqu(xmm2, xword[AO - 0x20]);
589 movdqu(xmm3, xword[AO - 0x10]);
590 prefetcht0(byte[AO + 0x1c0]);
591 pshufd(xmm4, xmm5, 0x55);
592 movaps(xmm6, xmm4);
593 pmaddubsw(xmm6, xmm0);
594 pmaddwd(xmm6, xmm7);
595 paddd(xmm8, xmm6);
596 movaps(xmm6, xmm4);
597 pmaddubsw(xmm6, xmm1);
598 pmaddwd(xmm6, xmm7);
599 paddd(xmm10, xmm6);
600 movaps(xmm6, xmm4);
601 pmaddubsw(xmm6, xmm2);
602 pmaddwd(xmm6, xmm7);
603 paddd(xmm12, xmm6);
604 pmaddubsw(xmm4, xmm3);
605 pmaddwd(xmm4, xmm7);
606 paddd(xmm14, xmm4);
607 movdqu(xmm5, xword[BO - 0x78]);
608 prefetcht1(byte[AA - 0x80]);
609 movdqu(xmm0, xword[AO]);
610 movdqu(xmm1, xword[AO + 0x10]);
611 movdqu(xmm2, xword[AO + 0x20]);
612 movdqu(xmm3, xword[AO + 0x30]);
613 add(AA, 0x4);
614 add(AO, 0x80);
615 add(BO, 0x8);
616 sub(H, 0x1);
617 jg(labels[68], T_NEAR);
618 align(4);
619
620 L(labels[70]);
621 mov(H, K);
622 test(H, 0x4);
623 je(labels[71], T_NEAR);
624 pshufd(xmm4, xmm5, 0x0);
625 movaps(xmm6, xmm4);
626 pmaddubsw(xmm6, xmm0);
627 pmaddwd(xmm6, xmm7);
628 paddd(xmm8, xmm6);
629 movaps(xmm6, xmm4);
630 pmaddubsw(xmm6, xmm1);
631 pmaddwd(xmm6, xmm7);
632 paddd(xmm10, xmm6);
633 movaps(xmm6, xmm4);
634 pmaddubsw(xmm6, xmm2);
635 pmaddwd(xmm6, xmm7);
636 paddd(xmm12, xmm6);
637 pmaddubsw(xmm4, xmm3);
638 pmaddwd(xmm4, xmm7);
639 paddd(xmm14, xmm4);
640 add(AO, 0x40);
641 add(BO, 0x4);
642 align(4);
643
644 L(labels[71]);
645 mov(H, K);
646 test(H, 0x2);
647 je(labels[72], T_NEAR);
648 xorps(xmm6, xmm6);
649 movdqu(xmm0, xword[AO - 0x80]);
650 movaps(xmm1, xmm0);
651 punpcklwd(xmm0, xmm6);
652 punpckhwd(xmm1, xmm6);
653 movdqu(xmm2, xword[AO - 0x70]);
654 movaps(xmm3, xmm2);
655 punpcklwd(xmm2, xmm6);
656 punpckhwd(xmm3, xmm6);
657 movss(xmm5, dword[BO - 0x80]);
658 punpcklwd(xmm5, xmm5);
659 pshufd(xmm4, xmm5, 0x0);
660 movaps(xmm6, xmm4);
661 pmaddubsw(xmm6, xmm0);
662 pmaddwd(xmm6, xmm7);
663 paddd(xmm8, xmm6);
664 movaps(xmm6, xmm4);
665 pmaddubsw(xmm6, xmm1);
666 pmaddwd(xmm6, xmm7);
667 paddd(xmm10, xmm6);
668 movaps(xmm6, xmm4);
669 pmaddubsw(xmm6, xmm2);
670 pmaddwd(xmm6, xmm7);
671 paddd(xmm12, xmm6);
672 pmaddubsw(xmm4, xmm3);
673 pmaddwd(xmm4, xmm7);
674 paddd(xmm14, xmm4);
675 add(AO, 0x20);
676 add(BO, 0x2);
677 align(4);
678
679 L(labels[72]);
680 mov(H, K);
681 test(H, 0x1);
682 je(labels[73], T_NEAR);
683 xorps(xmm6, xmm6);
684 movdqu(xmm3, xword[AO - 0x80]);
685 pshufd(xmm0, xmm3, 0x0);
686 punpcklbw(xmm0, xmm6);
687 punpcklwd(xmm0, xmm6);
688 pshufd(xmm1, xmm3, 0x55);
689 punpcklbw(xmm1, xmm6);
690 punpcklwd(xmm1, xmm6);
691 pshufd(xmm2, xmm3, 0xaa);
692 punpcklbw(xmm2, xmm6);
693 punpcklwd(xmm2, xmm6);
694 pshufd(xmm3, xmm3, 0xff);
695 punpcklbw(xmm3, xmm6);
696 punpcklwd(xmm3, xmm6);
697 movd(xmm5, dword[BO - 0x80]);
698 punpcklbw(xmm5, xmm5);
699 punpcklwd(xmm5, xmm5);
700 pshufd(xmm4, xmm5, 0x0);
701 movaps(xmm6, xmm4);
702 pmaddubsw(xmm6, xmm0);
703 pmaddwd(xmm6, xmm7);
704 paddd(xmm8, xmm6);
705 movaps(xmm6, xmm4);
706 pmaddubsw(xmm6, xmm1);
707 pmaddwd(xmm6, xmm7);
708 paddd(xmm10, xmm6);
709 movaps(xmm6, xmm4);
710 pmaddubsw(xmm6, xmm2);
711 pmaddwd(xmm6, xmm7);
712 paddd(xmm12, xmm6);
713 pmaddubsw(xmm4, xmm3);
714 pmaddwd(xmm4, xmm7);
715 paddd(xmm14, xmm4);
716 add(AO, 0x10);
717 add(BO, 0x1);
718 align(4);
719
720 L(labels[73]);
721 movdqu(xword[CO1], xmm8);
722 xorps(xmm8, xmm8);
723 movdqu(xword[CO1 + 0x10], xmm10);
724 xorps(xmm10, xmm10);
725 movdqu(xword[CO1 + 0x20], xmm12);
726 xorps(xmm12, xmm12);
727 movdqu(xword[CO1 + 0x30], xmm14);
728 xorps(xmm14, xmm14);
729 lea(CO1, ptr[CO1 + LDC * 1]);
730 align(4);
731
732 L(labels[74]);
733 mov(A, AO);
734 sub(J, 0x10);
735 cmp(J, 0x10);
736 jge(labels[69], T_NEAR);
737 align(4);
738
739 L(labels[75]);
740 test(J, 0x8);
741 jle(labels[4], T_NEAR);
742 mov(CO1, C);
743 add(C, 0x20);
744 mov(BO, B);
745 mov(AA, K);
746 shl(AA, 0x10);
747 lea(AA, ptr[A + AA * 1 + 0x200]);
748 mov(I, N);
749 cmp(I, 0x2);
750 jl(labels[85], T_NEAR);
751 align(4);
752
753 L(labels[76]);
754 mov(AO, A);
755 movdqu(xmm0, xword[AO - 0x80]);
756 movdqu(xmm1, xword[AO - 0x70]);
757 movdqu(xmm5, xword[BO - 0x80]);
758 mov(H, K);
759 sar(H, 0x3);
760 jle(labels[81], T_NEAR);
761 sub(H, 0x8);
762 jle(labels[79], T_NEAR);
763 align(4);
764
765 L(labels[77]);
766 pshufd(xmm4, xmm5, 0x0);
767 movaps(xmm6, xmm4);
768 pmaddubsw(xmm6, xmm0);
769 pmaddwd(xmm6, xmm7);
770 paddd(xmm8, xmm6);
771 prefetcht0(byte[AO + 0x180]);
772 pmaddubsw(xmm4, xmm1);
773 pmaddwd(xmm4, xmm7);
774 paddd(xmm10, xmm4);
775 prefetcht0(byte[BO]);
776 pshufd(xmm4, xmm5, 0x55);
777 movaps(xmm6, xmm4);
778 pmaddubsw(xmm6, xmm0);
779 pmaddwd(xmm6, xmm7);
780 paddd(xmm9, xmm6);
781 pmaddubsw(xmm4, xmm1);
782 pmaddwd(xmm4, xmm7);
783 paddd(xmm11, xmm4);
784 movdqu(xmm0, xword[AO - 0x60]);
785 movdqu(xmm1, xword[AO - 0x50]);
786 prefetcht0(byte[AO + 0x1c0]);
787 pshufd(xmm4, xmm5, 0xaa);
788 movaps(xmm6, xmm4);
789 pmaddubsw(xmm6, xmm0);
790 pmaddwd(xmm6, xmm7);
791 paddd(xmm8, xmm6);
792 pmaddubsw(xmm4, xmm1);
793 pmaddwd(xmm4, xmm7);
794 paddd(xmm10, xmm4);
795 pshufd(xmm4, xmm5, 0xff);
796 movaps(xmm6, xmm4);
797 pmaddubsw(xmm6, xmm0);
798 pmaddwd(xmm6, xmm7);
799 paddd(xmm9, xmm6);
800 pmaddubsw(xmm4, xmm1);
801 pmaddwd(xmm4, xmm7);
802 paddd(xmm11, xmm4);
803 movdqu(xmm5, xword[BO - 0x70]);
804 prefetcht1(byte[AA - 0x80]);
805 movdqu(xmm0, xword[AO - 0x40]);
806 movdqu(xmm1, xword[AO - 0x30]);
807 add(AA, 0x4);
808 add(AO, 0x40);
809 add(BO, 0x10);
810 sub(H, 0x1);
811 jg(labels[77], T_NEAR);
812 align(4);
813
814 L(labels[79]);
815 prefetcht0(byte[CO1 + 0x3c]);
816 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
817 add(H, 0x8);
818 jle(labels[81], T_NEAR);
819 align(4);
820
821 L(labels[80]);
822 pshufd(xmm4, xmm5, 0x0);
823 movaps(xmm6, xmm4);
824 pmaddubsw(xmm6, xmm0);
825 pmaddwd(xmm6, xmm7);
826 paddd(xmm8, xmm6);
827 prefetcht0(byte[AO + 0x180]);
828 pmaddubsw(xmm4, xmm1);
829 pmaddwd(xmm4, xmm7);
830 paddd(xmm10, xmm4);
831 prefetcht0(byte[BO]);
832 pshufd(xmm4, xmm5, 0x55);
833 movaps(xmm6, xmm4);
834 pmaddubsw(xmm6, xmm0);
835 pmaddwd(xmm6, xmm7);
836 paddd(xmm9, xmm6);
837 pmaddubsw(xmm4, xmm1);
838 pmaddwd(xmm4, xmm7);
839 paddd(xmm11, xmm4);
840 movdqu(xmm0, xword[AO - 0x60]);
841 movdqu(xmm1, xword[AO - 0x50]);
842 prefetcht0(byte[AO + 0x1c0]);
843 pshufd(xmm4, xmm5, 0xaa);
844 movaps(xmm6, xmm4);
845 pmaddubsw(xmm6, xmm0);
846 pmaddwd(xmm6, xmm7);
847 paddd(xmm8, xmm6);
848 pmaddubsw(xmm4, xmm1);
849 pmaddwd(xmm4, xmm7);
850 paddd(xmm10, xmm4);
851 pshufd(xmm4, xmm5, 0xff);
852 movaps(xmm6, xmm4);
853 pmaddubsw(xmm6, xmm0);
854 pmaddwd(xmm6, xmm7);
855 paddd(xmm9, xmm6);
856 pmaddubsw(xmm4, xmm1);
857 pmaddwd(xmm4, xmm7);
858 paddd(xmm11, xmm4);
859 movdqu(xmm5, xword[BO - 0x70]);
860 prefetcht1(byte[AA - 0x80]);
861 movdqu(xmm0, xword[AO - 0x40]);
862 movdqu(xmm1, xword[AO - 0x30]);
863 add(AA, 0x4);
864 add(AO, 0x40);
865 add(BO, 0x10);
866 sub(H, 0x1);
867 jg(labels[80], T_NEAR);
868 align(4);
869
870 L(labels[81]);
871 mov(H, K);
872 test(H, 0x4);
873 je(labels[82], T_NEAR);
874 pshufd(xmm4, xmm5, 0x0);
875 movaps(xmm6, xmm4);
876 pmaddubsw(xmm6, xmm0);
877 pmaddwd(xmm6, xmm7);
878 paddd(xmm8, xmm6);
879 pmaddubsw(xmm4, xmm1);
880 pmaddwd(xmm4, xmm7);
881 paddd(xmm10, xmm4);
882 pshufd(xmm4, xmm5, 0x55);
883 movaps(xmm6, xmm4);
884 pmaddubsw(xmm6, xmm0);
885 pmaddwd(xmm6, xmm7);
886 paddd(xmm9, xmm6);
887 pmaddubsw(xmm4, xmm1);
888 pmaddwd(xmm4, xmm7);
889 paddd(xmm11, xmm4);
890 add(AO, 0x20);
891 add(BO, 0x8);
892 align(4);
893
894 L(labels[82]);
895 mov(H, K);
896 test(H, 0x2);
897 je(labels[83], T_NEAR);
898 xorps(xmm6, xmm6);
899 movdqu(xmm0, xword[AO - 0x80]);
900 movaps(xmm1, xmm0);
901 punpcklwd(xmm0, xmm6);
902 punpckhwd(xmm1, xmm6);
903 movss(xmm5, dword[BO - 0x80]);
904 punpcklwd(xmm5, xmm5);
905 pshufd(xmm4, xmm5, 0x0);
906 movaps(xmm6, xmm4);
907 pmaddubsw(xmm6, xmm0);
908 pmaddwd(xmm6, xmm7);
909 paddd(xmm8, xmm6);
910 pmaddubsw(xmm4, xmm1);
911 pmaddwd(xmm4, xmm7);
912 paddd(xmm10, xmm4);
913 pshufd(xmm4, xmm5, 0x55);
914 movaps(xmm6, xmm4);
915 pmaddubsw(xmm6, xmm0);
916 pmaddwd(xmm6, xmm7);
917 paddd(xmm9, xmm6);
918 pmaddubsw(xmm4, xmm1);
919 pmaddwd(xmm4, xmm7);
920 paddd(xmm11, xmm4);
921 add(AO, 0x10);
922 add(BO, 0x4);
923 align(4);
924
925 L(labels[83]);
926 mov(H, K);
927 test(H, 0x1);
928 je(labels[84], T_NEAR);
929 xorps(xmm6, xmm6);
930 movdqu(xmm3, xword[AO - 0x80]);
931 pshufd(xmm0, xmm3, 0x0);
932 punpcklbw(xmm0, xmm6);
933 punpcklwd(xmm0, xmm6);
934 pshufd(xmm1, xmm3, 0x55);
935 punpcklbw(xmm1, xmm6);
936 punpcklwd(xmm1, xmm6);
937 movd(xmm5, dword[BO - 0x80]);
938 punpcklbw(xmm5, xmm5);
939 punpcklwd(xmm5, xmm5);
940 pshufd(xmm4, xmm5, 0x0);
941 movaps(xmm6, xmm4);
942 pmaddubsw(xmm6, xmm0);
943 pmaddwd(xmm6, xmm7);
944 paddd(xmm8, xmm6);
945 pmaddubsw(xmm4, xmm1);
946 pmaddwd(xmm4, xmm7);
947 paddd(xmm10, xmm4);
948 pshufd(xmm4, xmm5, 0x55);
949 movaps(xmm6, xmm4);
950 pmaddubsw(xmm6, xmm0);
951 pmaddwd(xmm6, xmm7);
952 paddd(xmm9, xmm6);
953 pmaddubsw(xmm4, xmm1);
954 pmaddwd(xmm4, xmm7);
955 paddd(xmm11, xmm4);
956 add(AO, 0x8);
957 add(BO, 0x2);
958 align(4);
959
960 L(labels[84]);
961 movdqu(xword[CO1], xmm8);
962 xorps(xmm8, xmm8);
963 movdqu(xword[CO1 + 0x10], xmm10);
964 xorps(xmm10, xmm10);
965 movdqu(xword[CO1 + LDC * 1], xmm9);
966 xorps(xmm9, xmm9);
967 movdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
968 xorps(xmm11, xmm11);
969 lea(CO1, ptr[CO1 + LDC * 2]);
970 sub(I, 0x2);
971 cmp(I, 0x2);
972 jge(labels[76], T_NEAR);
973 align(4);
974
975 L(labels[85]);
976 test(I, 0x1);
977 jle(labels[3], T_NEAR);
978 mov(AO, A);
979 movdqu(xmm0, xword[AO - 0x80]);
980 movdqu(xmm1, xword[AO - 0x70]);
981 movdqu(xmm5, xword[BO - 0x80]);
982 mov(H, K);
983 sar(H, 0x3);
984 jle(labels[90], T_NEAR);
985 sub(H, 0x8);
986 jle(labels[88], T_NEAR);
987 align(4);
988
989 L(labels[87]);
990 pshufd(xmm4, xmm5, 0x0);
991 movaps(xmm6, xmm4);
992 pmaddubsw(xmm6, xmm0);
993 pmaddwd(xmm6, xmm7);
994 paddd(xmm8, xmm6);
995 prefetcht0(byte[AO + 0x180]);
996 pmaddubsw(xmm4, xmm1);
997 pmaddwd(xmm4, xmm7);
998 paddd(xmm10, xmm4);
999 prefetcht0(byte[BO]);
1000 movdqu(xmm0, xword[AO - 0x60]);
1001 movdqu(xmm1, xword[AO - 0x50]);
1002 prefetcht0(byte[AO + 0x1c0]);
1003 pshufd(xmm4, xmm5, 0x55);
1004 movaps(xmm6, xmm4);
1005 pmaddubsw(xmm6, xmm0);
1006 pmaddwd(xmm6, xmm7);
1007 paddd(xmm8, xmm6);
1008 pmaddubsw(xmm4, xmm1);
1009 pmaddwd(xmm4, xmm7);
1010 paddd(xmm10, xmm4);
1011 movdqu(xmm5, xword[BO - 0x78]);
1012 prefetcht1(byte[AA - 0x80]);
1013 movdqu(xmm0, xword[AO - 0x40]);
1014 movdqu(xmm1, xword[AO - 0x30]);
1015 add(AA, 0x4);
1016 add(AO, 0x40);
1017 add(BO, 0x8);
1018 sub(H, 0x1);
1019 jg(labels[87], T_NEAR);
1020 align(4);
1021
1022 L(labels[88]);
1023 prefetcht0(byte[CO1 + 0x3c]);
1024 add(H, 0x8);
1025 jle(labels[90], T_NEAR);
1026 align(4);
1027
1028 L(labels[89]);
1029 pshufd(xmm4, xmm5, 0x0);
1030 movaps(xmm6, xmm4);
1031 pmaddubsw(xmm6, xmm0);
1032 pmaddwd(xmm6, xmm7);
1033 paddd(xmm8, xmm6);
1034 prefetcht0(byte[AO + 0x180]);
1035 pmaddubsw(xmm4, xmm1);
1036 pmaddwd(xmm4, xmm7);
1037 paddd(xmm10, xmm4);
1038 prefetcht0(byte[BO]);
1039 movdqu(xmm0, xword[AO - 0x60]);
1040 movdqu(xmm1, xword[AO - 0x50]);
1041 prefetcht0(byte[AO + 0x1c0]);
1042 pshufd(xmm4, xmm5, 0x55);
1043 movaps(xmm6, xmm4);
1044 pmaddubsw(xmm6, xmm0);
1045 pmaddwd(xmm6, xmm7);
1046 paddd(xmm8, xmm6);
1047 pmaddubsw(xmm4, xmm1);
1048 pmaddwd(xmm4, xmm7);
1049 paddd(xmm10, xmm4);
1050 movdqu(xmm5, xword[BO - 0x78]);
1051 prefetcht1(byte[AA - 0x80]);
1052 movdqu(xmm0, xword[AO - 0x40]);
1053 movdqu(xmm1, xword[AO - 0x30]);
1054 add(AA, 0x4);
1055 add(AO, 0x40);
1056 add(BO, 0x8);
1057 sub(H, 0x1);
1058 jg(labels[89], T_NEAR);
1059 align(4);
1060
1061 L(labels[90]);
1062 mov(H, K);
1063 test(H, 0x4);
1064 je(labels[0], T_NEAR);
1065 pshufd(xmm4, xmm5, 0x0);
1066 movaps(xmm6, xmm4);
1067 pmaddubsw(xmm6, xmm0);
1068 pmaddwd(xmm6, xmm7);
1069 paddd(xmm8, xmm6);
1070 pmaddubsw(xmm4, xmm1);
1071 pmaddwd(xmm4, xmm7);
1072 paddd(xmm10, xmm4);
1073 add(AO, 0x20);
1074 add(BO, 0x4);
1075 align(4);
1076
1077 L(labels[0]);
1078 mov(H, K);
1079 test(H, 0x2);
1080 je(labels[1], T_NEAR);
1081 xorps(xmm6, xmm6);
1082 movdqu(xmm0, xword[AO - 0x80]);
1083 movaps(xmm1, xmm0);
1084 punpcklwd(xmm0, xmm6);
1085 punpckhwd(xmm1, xmm6);
1086 movss(xmm5, dword[BO - 0x80]);
1087 punpcklwd(xmm5, xmm5);
1088 pshufd(xmm4, xmm5, 0x0);
1089 movaps(xmm6, xmm4);
1090 pmaddubsw(xmm6, xmm0);
1091 pmaddwd(xmm6, xmm7);
1092 paddd(xmm8, xmm6);
1093 pmaddubsw(xmm4, xmm1);
1094 pmaddwd(xmm4, xmm7);
1095 paddd(xmm10, xmm4);
1096 add(AO, 0x10);
1097 add(BO, 0x2);
1098 align(4);
1099
1100 L(labels[1]);
1101 mov(H, K);
1102 test(H, 0x1);
1103 je(labels[2], T_NEAR);
1104 xorps(xmm6, xmm6);
1105 movdqu(xmm3, xword[AO - 0x80]);
1106 pshufd(xmm0, xmm3, 0x0);
1107 punpcklbw(xmm0, xmm6);
1108 punpcklwd(xmm0, xmm6);
1109 pshufd(xmm1, xmm3, 0x55);
1110 punpcklbw(xmm1, xmm6);
1111 punpcklwd(xmm1, xmm6);
1112 movd(xmm5, dword[BO - 0x80]);
1113 punpcklbw(xmm5, xmm5);
1114 punpcklwd(xmm5, xmm5);
1115 pshufd(xmm4, xmm5, 0x0);
1116 movaps(xmm6, xmm4);
1117 pmaddubsw(xmm6, xmm0);
1118 pmaddwd(xmm6, xmm7);
1119 paddd(xmm8, xmm6);
1120 pmaddubsw(xmm4, xmm1);
1121 pmaddwd(xmm4, xmm7);
1122 paddd(xmm10, xmm4);
1123 add(AO, 0x8);
1124 add(BO, 0x1);
1125 align(4);
1126
1127 L(labels[2]);
1128 movdqu(xword[CO1], xmm8);
1129 xorps(xmm8, xmm8);
1130 movdqu(xword[CO1 + 0x10], xmm10);
1131 xorps(xmm10, xmm10);
1132 lea(CO1, ptr[CO1 + LDC * 1]);
1133 align(4);
1134
1135 L(labels[3]);
1136 mov(A, AO);
1137 align(4);
1138
1139 L(labels[4]);
1140 test(J, 0x4);
1141 jle(labels[22], T_NEAR);
1142 mov(CO1, C);
1143 add(C, 0x10);
1144 mov(BO, B);
1145 mov(AA, K);
1146 shl(AA, 0x8);
1147 lea(AA, ptr[A + AA * 1 + 0x200]);
1148 mov(I, N);
1149 cmp(I, 0x2);
1150 jl(labels[13], T_NEAR);
1151 align(4);
1152
1153 L(labels[5]);
1154 mov(AO, A);
1155 movdqu(xmm0, xword[AO - 0x80]);
1156 movdqu(xmm5, xword[BO - 0x80]);
1157 mov(H, K);
1158 sar(H, 0x3);
1159 jle(labels[9], T_NEAR);
1160 sub(H, 0x8);
1161 jle(labels[7], T_NEAR);
1162 align(4);
1163
1164 L(labels[6]);
1165 pshufd(xmm4, xmm5, 0x0);
1166 pmaddubsw(xmm4, xmm0);
1167 pmaddwd(xmm4, xmm7);
1168 paddd(xmm8, xmm4);
1169 prefetcht0(byte[AO + 0x180]);
1170 prefetcht0(byte[BO]);
1171 pshufd(xmm4, xmm5, 0x55);
1172 pmaddubsw(xmm4, xmm0);
1173 pmaddwd(xmm4, xmm7);
1174 paddd(xmm9, xmm4);
1175 movdqu(xmm0, xword[AO - 0x70]);
1176 prefetcht0(byte[AO + 0x1c0]);
1177 pshufd(xmm4, xmm5, 0xaa);
1178 pmaddubsw(xmm4, xmm0);
1179 pmaddwd(xmm4, xmm7);
1180 paddd(xmm8, xmm4);
1181 pshufd(xmm4, xmm5, 0xff);
1182 pmaddubsw(xmm4, xmm0);
1183 pmaddwd(xmm4, xmm7);
1184 paddd(xmm9, xmm4);
1185 movdqu(xmm5, xword[BO - 0x70]);
1186 prefetcht1(byte[AA - 0x80]);
1187 movdqu(xmm0, xword[AO - 0x60]);
1188 add(AA, 0x4);
1189 add(AO, 0x20);
1190 add(BO, 0x10);
1191 sub(H, 0x1);
1192 jg(labels[6], T_NEAR);
1193 align(4);
1194
1195 L(labels[7]);
1196 prefetcht0(byte[CO1 + 0x3c]);
1197 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
1198 add(H, 0x8);
1199 jle(labels[9], T_NEAR);
1200 align(4);
1201
1202 L(labels[8]);
1203 pshufd(xmm4, xmm5, 0x0);
1204 pmaddubsw(xmm4, xmm0);
1205 pmaddwd(xmm4, xmm7);
1206 paddd(xmm8, xmm4);
1207 prefetcht0(byte[AO + 0x180]);
1208 prefetcht0(byte[BO]);
1209 pshufd(xmm4, xmm5, 0x55);
1210 pmaddubsw(xmm4, xmm0);
1211 pmaddwd(xmm4, xmm7);
1212 paddd(xmm9, xmm4);
1213 movdqu(xmm0, xword[AO - 0x70]);
1214 prefetcht0(byte[AO + 0x1c0]);
1215 pshufd(xmm4, xmm5, 0xaa);
1216 pmaddubsw(xmm4, xmm0);
1217 pmaddwd(xmm4, xmm7);
1218 paddd(xmm8, xmm4);
1219 pshufd(xmm4, xmm5, 0xff);
1220 pmaddubsw(xmm4, xmm0);
1221 pmaddwd(xmm4, xmm7);
1222 paddd(xmm9, xmm4);
1223 movdqu(xmm5, xword[BO - 0x70]);
1224 prefetcht1(byte[AA - 0x80]);
1225 movdqu(xmm0, xword[AO - 0x60]);
1226 add(AA, 0x4);
1227 add(AO, 0x20);
1228 add(BO, 0x10);
1229 sub(H, 0x1);
1230 jg(labels[8], T_NEAR);
1231 align(4);
1232
1233 L(labels[9]);
1234 mov(H, K);
1235 test(H, 0x4);
1236 je(labels[10], T_NEAR);
1237 pshufd(xmm4, xmm5, 0x0);
1238 pmaddubsw(xmm4, xmm0);
1239 pmaddwd(xmm4, xmm7);
1240 paddd(xmm8, xmm4);
1241 pshufd(xmm4, xmm5, 0x55);
1242 pmaddubsw(xmm4, xmm0);
1243 pmaddwd(xmm4, xmm7);
1244 paddd(xmm9, xmm4);
1245 add(AO, 0x10);
1246 add(BO, 0x8);
1247 align(4);
1248
1249 L(labels[10]);
1250 mov(H, K);
1251 test(H, 0x2);
1252 je(labels[11], T_NEAR);
1253 xorps(xmm6, xmm6);
1254 movdqu(xmm0, xword[AO - 0x80]);
1255 movaps(xmm1, xmm0);
1256 punpcklwd(xmm0, xmm6);
1257 movss(xmm5, dword[BO - 0x80]);
1258 punpcklwd(xmm5, xmm5);
1259 pshufd(xmm4, xmm5, 0x0);
1260 pmaddubsw(xmm4, xmm0);
1261 pmaddwd(xmm4, xmm7);
1262 paddd(xmm8, xmm4);
1263 pshufd(xmm4, xmm5, 0x55);
1264 pmaddubsw(xmm4, xmm0);
1265 pmaddwd(xmm4, xmm7);
1266 paddd(xmm9, xmm4);
1267 add(AO, 0x8);
1268 add(BO, 0x4);
1269 align(4);
1270
1271 L(labels[11]);
1272 mov(H, K);
1273 test(H, 0x1);
1274 je(labels[12], T_NEAR);
1275 xorps(xmm6, xmm6);
1276 movdqu(xmm3, xword[AO - 0x80]);
1277 pshufd(xmm0, xmm3, 0x0);
1278 punpcklbw(xmm0, xmm6);
1279 punpcklwd(xmm0, xmm6);
1280 movd(xmm5, dword[BO - 0x80]);
1281 punpcklbw(xmm5, xmm5);
1282 punpcklwd(xmm5, xmm5);
1283 pshufd(xmm4, xmm5, 0x0);
1284 pmaddubsw(xmm4, xmm0);
1285 pmaddwd(xmm4, xmm7);
1286 paddd(xmm8, xmm4);
1287 pshufd(xmm4, xmm5, 0x55);
1288 pmaddubsw(xmm4, xmm0);
1289 pmaddwd(xmm4, xmm7);
1290 paddd(xmm9, xmm4);
1291 add(AO, 0x4);
1292 add(BO, 0x2);
1293 align(4);
1294
1295 L(labels[12]);
1296 movdqu(xword[CO1], xmm8);
1297 xorps(xmm8, xmm8);
1298 movdqu(xword[CO1 + LDC * 1], xmm9);
1299 xorps(xmm9, xmm9);
1300 lea(CO1, ptr[CO1 + LDC * 2]);
1301 sub(I, 0x2);
1302 cmp(I, 0x2);
1303 jge(labels[5], T_NEAR);
1304 align(4);
1305
1306 L(labels[13]);
1307 test(I, 0x1);
1308 jle(labels[21], T_NEAR);
1309 mov(AO, A);
1310 movdqu(xmm0, xword[AO - 0x80]);
1311 movdqu(xmm5, xword[BO - 0x80]);
1312 mov(H, K);
1313 sar(H, 0x3);
1314 jle(labels[17], T_NEAR);
1315 sub(H, 0x8);
1316 jle(labels[15], T_NEAR);
1317 align(4);
1318
1319 L(labels[14]);
1320 pshufd(xmm4, xmm5, 0x0);
1321 pmaddubsw(xmm4, xmm0);
1322 pmaddwd(xmm4, xmm7);
1323 paddd(xmm8, xmm4);
1324 prefetcht0(byte[AO + 0x180]);
1325 prefetcht0(byte[BO]);
1326 movdqu(xmm0, xword[AO - 0x70]);
1327 prefetcht0(byte[AO + 0x1c0]);
1328 pshufd(xmm4, xmm5, 0x55);
1329 pmaddubsw(xmm4, xmm0);
1330 pmaddwd(xmm4, xmm7);
1331 paddd(xmm8, xmm4);
1332 movdqu(xmm5, xword[BO - 0x78]);
1333 prefetcht1(byte[AA - 0x80]);
1334 movdqu(xmm0, xword[AO - 0x60]);
1335 add(AA, 0x4);
1336 add(AO, 0x20);
1337 add(BO, 0x8);
1338 sub(H, 0x1);
1339 jg(labels[14], T_NEAR);
1340 align(4);
1341
1342 L(labels[15]);
1343 prefetcht0(byte[CO1 + 0x3c]);
1344 add(H, 0x8);
1345 jle(labels[17], T_NEAR);
1346 align(4);
1347
1348 L(labels[16]);
1349 pshufd(xmm4, xmm5, 0x0);
1350 pmaddubsw(xmm4, xmm0);
1351 pmaddwd(xmm4, xmm7);
1352 paddd(xmm8, xmm4);
1353 prefetcht0(byte[AO + 0x180]);
1354 prefetcht0(byte[BO]);
1355 movdqu(xmm0, xword[AO - 0x70]);
1356 prefetcht0(byte[AO + 0x1c0]);
1357 pshufd(xmm4, xmm5, 0x55);
1358 pmaddubsw(xmm4, xmm0);
1359 pmaddwd(xmm4, xmm7);
1360 paddd(xmm8, xmm4);
1361 movdqu(xmm5, xword[BO - 0x78]);
1362 prefetcht1(byte[AA - 0x80]);
1363 movdqu(xmm0, xword[AO - 0x60]);
1364 add(AA, 0x4);
1365 add(AO, 0x20);
1366 add(BO, 0x8);
1367 sub(H, 0x1);
1368 jg(labels[16], T_NEAR);
1369 align(4);
1370
1371 L(labels[17]);
1372 mov(H, K);
1373 test(H, 0x4);
1374 je(labels[18], T_NEAR);
1375 pshufd(xmm4, xmm5, 0x0);
1376 pmaddubsw(xmm4, xmm0);
1377 pmaddwd(xmm4, xmm7);
1378 paddd(xmm8, xmm4);
1379 add(AO, 0x10);
1380 add(BO, 0x4);
1381 align(4);
1382
1383 L(labels[18]);
1384 mov(H, K);
1385 test(H, 0x2);
1386 je(labels[19], T_NEAR);
1387 xorps(xmm6, xmm6);
1388 movdqu(xmm0, xword[AO - 0x80]);
1389 movaps(xmm1, xmm0);
1390 punpcklwd(xmm0, xmm6);
1391 movss(xmm5, dword[BO - 0x80]);
1392 punpcklwd(xmm5, xmm5);
1393 pshufd(xmm4, xmm5, 0x0);
1394 pmaddubsw(xmm4, xmm0);
1395 pmaddwd(xmm4, xmm7);
1396 paddd(xmm8, xmm4);
1397 add(AO, 0x8);
1398 add(BO, 0x2);
1399 align(4);
1400
1401 L(labels[19]);
1402 mov(H, K);
1403 test(H, 0x1);
1404 je(labels[20], T_NEAR);
1405 xorps(xmm6, xmm6);
1406 movdqu(xmm3, xword[AO - 0x80]);
1407 pshufd(xmm0, xmm3, 0x0);
1408 punpcklbw(xmm0, xmm6);
1409 punpcklwd(xmm0, xmm6);
1410 movd(xmm5, dword[BO - 0x80]);
1411 punpcklbw(xmm5, xmm5);
1412 punpcklwd(xmm5, xmm5);
1413 pshufd(xmm4, xmm5, 0x0);
1414 pmaddubsw(xmm4, xmm0);
1415 pmaddwd(xmm4, xmm7);
1416 paddd(xmm8, xmm4);
1417 add(AO, 0x4);
1418 add(BO, 0x1);
1419 align(4);
1420
1421 L(labels[20]);
1422 movdqu(xword[CO1], xmm8);
1423 xorps(xmm8, xmm8);
1424 lea(CO1, ptr[CO1 + LDC * 1]);
1425 align(4);
1426
1427 L(labels[21]);
1428 mov(A, AO);
1429 align(4);
1430
1431 L(labels[22]);
1432 test(J, 0x2);
1433 jle(labels[40], T_NEAR);
1434 mov(CO1, C);
1435 add(C, 0x8);
1436 mov(BO, B);
1437 mov(AA, K);
1438 shl(AA, 0x4);
1439 lea(AA, ptr[A + AA * 1 + 0x200]);
1440 mov(I, N);
1441 cmp(I, 0x2);
1442 jl(labels[31], T_NEAR);
1443 align(4);
1444
1445 L(labels[23]);
1446 mov(AO, A);
1447 movdqu(xmm0, xword[AO - 0x80]);
1448 movdqu(xmm5, xword[BO - 0x80]);
1449 mov(H, K);
1450 sar(H, 0x3);
1451 jle(labels[27], T_NEAR);
1452 sub(H, 0x8);
1453 jle(labels[25], T_NEAR);
1454 align(4);
1455
1456 L(labels[24]);
1457 pshufd(xmm4, xmm5, 0x0);
1458 pmaddubsw(xmm4, xmm0);
1459 pmaddwd(xmm4, xmm7);
1460 paddd(xmm8, xmm4);
1461 prefetcht0(byte[AO + 0x180]);
1462 prefetcht0(byte[BO]);
1463 pshufd(xmm4, xmm5, 0x55);
1464 pmaddubsw(xmm4, xmm0);
1465 pmaddwd(xmm4, xmm7);
1466 paddd(xmm9, xmm4);
1467 movdqu(xmm0, xword[AO - 0x78]);
1468 prefetcht0(byte[AO + 0x1c0]);
1469 pshufd(xmm4, xmm5, 0xaa);
1470 pmaddubsw(xmm4, xmm0);
1471 pmaddwd(xmm4, xmm7);
1472 paddd(xmm8, xmm4);
1473 pshufd(xmm4, xmm5, 0xff);
1474 pmaddubsw(xmm4, xmm0);
1475 pmaddwd(xmm4, xmm7);
1476 paddd(xmm9, xmm4);
1477 movdqu(xmm5, xword[BO - 0x70]);
1478 prefetcht1(byte[AA - 0x80]);
1479 movdqu(xmm0, xword[AO - 0x70]);
1480 add(AA, 0x4);
1481 add(AO, 0x10);
1482 add(BO, 0x10);
1483 sub(H, 0x1);
1484 jg(labels[24], T_NEAR);
1485 align(4);
1486
1487 L(labels[25]);
1488 prefetcht0(byte[CO1 + 0x3c]);
1489 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
1490 add(H, 0x8);
1491 jle(labels[27], T_NEAR);
1492 align(4);
1493
1494 L(labels[26]);
1495 pshufd(xmm4, xmm5, 0x0);
1496 pmaddubsw(xmm4, xmm0);
1497 pmaddwd(xmm4, xmm7);
1498 paddd(xmm8, xmm4);
1499 prefetcht0(byte[AO + 0x180]);
1500 prefetcht0(byte[BO]);
1501 pshufd(xmm4, xmm5, 0x55);
1502 pmaddubsw(xmm4, xmm0);
1503 pmaddwd(xmm4, xmm7);
1504 paddd(xmm9, xmm4);
1505 movdqu(xmm0, xword[AO - 0x78]);
1506 prefetcht0(byte[AO + 0x1c0]);
1507 pshufd(xmm4, xmm5, 0xaa);
1508 pmaddubsw(xmm4, xmm0);
1509 pmaddwd(xmm4, xmm7);
1510 paddd(xmm8, xmm4);
1511 pshufd(xmm4, xmm5, 0xff);
1512 pmaddubsw(xmm4, xmm0);
1513 pmaddwd(xmm4, xmm7);
1514 paddd(xmm9, xmm4);
1515 movdqu(xmm5, xword[BO - 0x70]);
1516 prefetcht1(byte[AA - 0x80]);
1517 movdqu(xmm0, xword[AO - 0x70]);
1518 add(AA, 0x4);
1519 add(AO, 0x10);
1520 add(BO, 0x10);
1521 sub(H, 0x1);
1522 jg(labels[26], T_NEAR);
1523 align(4);
1524
1525 L(labels[27]);
1526 mov(H, K);
1527 test(H, 0x4);
1528 je(labels[28], T_NEAR);
1529 pshufd(xmm4, xmm5, 0x0);
1530 pmaddubsw(xmm4, xmm0);
1531 pmaddwd(xmm4, xmm7);
1532 paddd(xmm8, xmm4);
1533 pshufd(xmm4, xmm5, 0x55);
1534 pmaddubsw(xmm4, xmm0);
1535 pmaddwd(xmm4, xmm7);
1536 paddd(xmm9, xmm4);
1537 add(AO, 0x8);
1538 add(BO, 0x8);
1539 align(4);
1540
1541 L(labels[28]);
1542 mov(H, K);
1543 test(H, 0x2);
1544 je(labels[29], T_NEAR);
1545 xorps(xmm6, xmm6);
1546 movdqu(xmm0, xword[AO - 0x80]);
1547 movaps(xmm1, xmm0);
1548 punpcklwd(xmm0, xmm6);
1549 movss(xmm5, dword[BO - 0x80]);
1550 punpcklwd(xmm5, xmm5);
1551 pshufd(xmm4, xmm5, 0x0);
1552 pmaddubsw(xmm4, xmm0);
1553 pmaddwd(xmm4, xmm7);
1554 paddd(xmm8, xmm4);
1555 pshufd(xmm4, xmm5, 0x55);
1556 pmaddubsw(xmm4, xmm0);
1557 pmaddwd(xmm4, xmm7);
1558 paddd(xmm9, xmm4);
1559 add(AO, 0x4);
1560 add(BO, 0x4);
1561 align(4);
1562
1563 L(labels[29]);
1564 mov(H, K);
1565 test(H, 0x1);
1566 je(labels[30], T_NEAR);
1567 xorps(xmm6, xmm6);
1568 movdqu(xmm3, xword[AO - 0x80]);
1569 pshufd(xmm0, xmm3, 0x0);
1570 punpcklbw(xmm0, xmm6);
1571 punpcklwd(xmm0, xmm6);
1572 movd(xmm5, dword[BO - 0x80]);
1573 punpcklbw(xmm5, xmm5);
1574 punpcklwd(xmm5, xmm5);
1575 pshufd(xmm4, xmm5, 0x0);
1576 pmaddubsw(xmm4, xmm0);
1577 pmaddwd(xmm4, xmm7);
1578 paddd(xmm8, xmm4);
1579 pshufd(xmm4, xmm5, 0x55);
1580 pmaddubsw(xmm4, xmm0);
1581 pmaddwd(xmm4, xmm7);
1582 paddd(xmm9, xmm4);
1583 add(AO, 0x2);
1584 add(BO, 0x2);
1585 align(4);
1586
1587 L(labels[30]);
1588 movlps(qword[CO1], xmm8);
1589 xorps(xmm8, xmm8);
1590 movlps(qword[CO1 + LDC * 1], xmm9);
1591 xorps(xmm9, xmm9);
1592 lea(CO1, ptr[CO1 + LDC * 2]);
1593 sub(I, 0x2);
1594 cmp(I, 0x2);
1595 jge(labels[23], T_NEAR);
1596 align(4);
1597
1598 L(labels[31]);
1599 test(I, 0x1);
1600 jle(labels[39], T_NEAR);
1601 mov(AO, A);
1602 movdqu(xmm0, xword[AO - 0x80]);
1603 movdqu(xmm5, xword[BO - 0x80]);
1604 mov(H, K);
1605 sar(H, 0x3);
1606 jle(labels[35], T_NEAR);
1607 sub(H, 0x8);
1608 jle(labels[33], T_NEAR);
1609 align(4);
1610
1611 L(labels[32]);
1612 pshufd(xmm4, xmm5, 0x0);
1613 pmaddubsw(xmm4, xmm0);
1614 pmaddwd(xmm4, xmm7);
1615 paddd(xmm8, xmm4);
1616 prefetcht0(byte[AO + 0x180]);
1617 prefetcht0(byte[BO]);
1618 movdqu(xmm0, xword[AO - 0x78]);
1619 prefetcht0(byte[AO + 0x1c0]);
1620 pshufd(xmm4, xmm5, 0x55);
1621 pmaddubsw(xmm4, xmm0);
1622 pmaddwd(xmm4, xmm7);
1623 paddd(xmm8, xmm4);
1624 movdqu(xmm5, xword[BO - 0x78]);
1625 prefetcht1(byte[AA - 0x80]);
1626 movdqu(xmm0, xword[AO - 0x70]);
1627 add(AA, 0x4);
1628 add(AO, 0x10);
1629 add(BO, 0x8);
1630 sub(H, 0x1);
1631 jg(labels[32], T_NEAR);
1632 align(4);
1633
1634 L(labels[33]);
1635 prefetcht0(byte[CO1 + 0x3c]);
1636 add(H, 0x8);
1637 jle(labels[35], T_NEAR);
1638 align(4);
1639
1640 L(labels[34]);
1641 pshufd(xmm4, xmm5, 0x0);
1642 pmaddubsw(xmm4, xmm0);
1643 pmaddwd(xmm4, xmm7);
1644 paddd(xmm8, xmm4);
1645 prefetcht0(byte[AO + 0x180]);
1646 prefetcht0(byte[BO]);
1647 movdqu(xmm0, xword[AO - 0x78]);
1648 prefetcht0(byte[AO + 0x1c0]);
1649 pshufd(xmm4, xmm5, 0x55);
1650 pmaddubsw(xmm4, xmm0);
1651 pmaddwd(xmm4, xmm7);
1652 paddd(xmm8, xmm4);
1653 movdqu(xmm5, xword[BO - 0x78]);
1654 prefetcht1(byte[AA - 0x80]);
1655 movdqu(xmm0, xword[AO - 0x70]);
1656 add(AA, 0x4);
1657 add(AO, 0x10);
1658 add(BO, 0x8);
1659 sub(H, 0x1);
1660 jg(labels[34], T_NEAR);
1661 align(4);
1662
1663 L(labels[35]);
1664 mov(H, K);
1665 test(H, 0x4);
1666 je(labels[36], T_NEAR);
1667 pshufd(xmm4, xmm5, 0x0);
1668 pmaddubsw(xmm4, xmm0);
1669 pmaddwd(xmm4, xmm7);
1670 paddd(xmm8, xmm4);
1671 add(AO, 0x8);
1672 add(BO, 0x4);
1673 align(4);
1674
1675 L(labels[36]);
1676 mov(H, K);
1677 test(H, 0x2);
1678 je(labels[37], T_NEAR);
1679 xorps(xmm6, xmm6);
1680 movdqu(xmm0, xword[AO - 0x80]);
1681 movaps(xmm1, xmm0);
1682 punpcklwd(xmm0, xmm6);
1683 movss(xmm5, dword[BO - 0x80]);
1684 punpcklwd(xmm5, xmm5);
1685 pshufd(xmm4, xmm5, 0x0);
1686 pmaddubsw(xmm4, xmm0);
1687 pmaddwd(xmm4, xmm7);
1688 paddd(xmm8, xmm4);
1689 add(AO, 0x4);
1690 add(BO, 0x2);
1691 align(4);
1692
1693 L(labels[37]);
1694 mov(H, K);
1695 test(H, 0x1);
1696 je(labels[38], T_NEAR);
1697 xorps(xmm6, xmm6);
1698 movdqu(xmm3, xword[AO - 0x80]);
1699 pshufd(xmm0, xmm3, 0x0);
1700 punpcklbw(xmm0, xmm6);
1701 punpcklwd(xmm0, xmm6);
1702 movd(xmm5, dword[BO - 0x80]);
1703 punpcklbw(xmm5, xmm5);
1704 punpcklwd(xmm5, xmm5);
1705 pshufd(xmm4, xmm5, 0x0);
1706 pmaddubsw(xmm4, xmm0);
1707 pmaddwd(xmm4, xmm7);
1708 paddd(xmm8, xmm4);
1709 add(AO, 0x2);
1710 add(BO, 0x1);
1711 align(4);
1712
1713 L(labels[38]);
1714 movlps(qword[CO1], xmm8);
1715 xorps(xmm8, xmm8);
1716 lea(CO1, ptr[CO1 + LDC * 1]);
1717 align(4);
1718
1719 L(labels[39]);
1720 mov(A, AO);
1721 align(4);
1722
1723 L(labels[40]);
1724 test(J, 0x1);
1725 jle(labels[58], T_NEAR);
1726 mov(CO1, C);
1727 add(C, 0x4);
1728 mov(BO, B);
1729 mov(AA, K);
1730 shl(AA, 0x2);
1731 lea(AA, ptr[A + AA * 1 + 0x200]);
1732 mov(I, N);
1733 cmp(I, 0x2);
1734 jl(labels[49], T_NEAR);
1735 align(4);
1736
1737 L(labels[41]);
1738 mov(AO, A);
1739 movdqu(xmm0, xword[AO - 0x80]);
1740 movdqu(xmm5, xword[BO - 0x80]);
1741 mov(H, K);
1742 sar(H, 0x3);
1743 jle(labels[45], T_NEAR);
1744 sub(H, 0x8);
1745 jle(labels[43], T_NEAR);
1746 align(4);
1747
1748 L(labels[42]);
1749 pshufd(xmm4, xmm5, 0x0);
1750 pmaddubsw(xmm4, xmm0);
1751 pmaddwd(xmm4, xmm7);
1752 paddd(xmm8, xmm4);
1753 prefetcht0(byte[AO + 0x180]);
1754 prefetcht0(byte[BO]);
1755 pshufd(xmm4, xmm5, 0x55);
1756 pmaddubsw(xmm4, xmm0);
1757 pmaddwd(xmm4, xmm7);
1758 paddd(xmm9, xmm4);
1759 movdqu(xmm0, xword[AO - 0x7c]);
1760 prefetcht0(byte[AO + 0x1c0]);
1761 pshufd(xmm4, xmm5, 0xaa);
1762 pmaddubsw(xmm4, xmm0);
1763 pmaddwd(xmm4, xmm7);
1764 paddd(xmm8, xmm4);
1765 pshufd(xmm4, xmm5, 0xff);
1766 pmaddubsw(xmm4, xmm0);
1767 pmaddwd(xmm4, xmm7);
1768 paddd(xmm9, xmm4);
1769 movdqu(xmm5, xword[BO - 0x70]);
1770 prefetcht1(byte[AA - 0x80]);
1771 movdqu(xmm0, xword[AO - 0x78]);
1772 add(AA, 0x4);
1773 add(AO, 0x8);
1774 add(BO, 0x10);
1775 sub(H, 0x1);
1776 jg(labels[42], T_NEAR);
1777 align(4);
1778
1779 L(labels[43]);
1780 prefetcht0(byte[CO1 + 0x3c]);
1781 prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
1782 add(H, 0x8);
1783 jle(labels[45], T_NEAR);
1784 align(4);
1785
1786 L(labels[44]);
1787 pshufd(xmm4, xmm5, 0x0);
1788 pmaddubsw(xmm4, xmm0);
1789 pmaddwd(xmm4, xmm7);
1790 paddd(xmm8, xmm4);
1791 prefetcht0(byte[AO + 0x180]);
1792 prefetcht0(byte[BO]);
1793 pshufd(xmm4, xmm5, 0x55);
1794 pmaddubsw(xmm4, xmm0);
1795 pmaddwd(xmm4, xmm7);
1796 paddd(xmm9, xmm4);
1797 movdqu(xmm0, xword[AO - 0x7c]);
1798 prefetcht0(byte[AO + 0x1c0]);
1799 pshufd(xmm4, xmm5, 0xaa);
1800 pmaddubsw(xmm4, xmm0);
1801 pmaddwd(xmm4, xmm7);
1802 paddd(xmm8, xmm4);
1803 pshufd(xmm4, xmm5, 0xff);
1804 pmaddubsw(xmm4, xmm0);
1805 pmaddwd(xmm4, xmm7);
1806 paddd(xmm9, xmm4);
1807 movdqu(xmm5, xword[BO - 0x70]);
1808 prefetcht1(byte[AA - 0x80]);
1809 movdqu(xmm0, xword[AO - 0x78]);
1810 add(AA, 0x4);
1811 add(AO, 0x8);
1812 add(BO, 0x10);
1813 sub(H, 0x1);
1814 jg(labels[44], T_NEAR);
1815 align(4);
1816
1817 L(labels[45]);
1818 mov(H, K);
1819 test(H, 0x4);
1820 je(labels[46], T_NEAR);
1821 pshufd(xmm4, xmm5, 0x0);
1822 pmaddubsw(xmm4, xmm0);
1823 pmaddwd(xmm4, xmm7);
1824 paddd(xmm8, xmm4);
1825 pshufd(xmm4, xmm5, 0x55);
1826 pmaddubsw(xmm4, xmm0);
1827 pmaddwd(xmm4, xmm7);
1828 paddd(xmm9, xmm4);
1829 add(AO, 0x4);
1830 add(BO, 0x8);
1831 align(4);
1832
1833 L(labels[46]);
1834 mov(H, K);
1835 test(H, 0x2);
1836 je(labels[47], T_NEAR);
1837 xorps(xmm6, xmm6);
1838 movdqu(xmm0, xword[AO - 0x80]);
1839 movaps(xmm1, xmm0);
1840 punpcklwd(xmm0, xmm6);
1841 movss(xmm5, dword[BO - 0x80]);
1842 punpcklwd(xmm5, xmm5);
1843 pshufd(xmm4, xmm5, 0x0);
1844 pmaddubsw(xmm4, xmm0);
1845 pmaddwd(xmm4, xmm7);
1846 paddd(xmm8, xmm4);
1847 pshufd(xmm4, xmm5, 0x55);
1848 pmaddubsw(xmm4, xmm0);
1849 pmaddwd(xmm4, xmm7);
1850 paddd(xmm9, xmm4);
1851 add(AO, 0x2);
1852 add(BO, 0x4);
1853 align(4);
1854
1855 L(labels[47]);
1856 mov(H, K);
1857 test(H, 0x1);
1858 je(labels[48], T_NEAR);
1859 xorps(xmm6, xmm6);
1860 movdqu(xmm3, xword[AO - 0x80]);
1861 pshufd(xmm0, xmm3, 0x0);
1862 punpcklbw(xmm0, xmm6);
1863 punpcklwd(xmm0, xmm6);
1864 movd(xmm5, dword[BO - 0x80]);
1865 punpcklbw(xmm5, xmm5);
1866 punpcklwd(xmm5, xmm5);
1867 pshufd(xmm4, xmm5, 0x0);
1868 pmaddubsw(xmm4, xmm0);
1869 pmaddwd(xmm4, xmm7);
1870 paddd(xmm8, xmm4);
1871 pshufd(xmm4, xmm5, 0x55);
1872 pmaddubsw(xmm4, xmm0);
1873 pmaddwd(xmm4, xmm7);
1874 paddd(xmm9, xmm4);
1875 add(AO, 0x1);
1876 add(BO, 0x2);
1877 align(4);
1878
1879 L(labels[48]);
1880 movss(dword[CO1], xmm8);
1881 xorps(xmm8, xmm8);
1882 movss(dword[CO1 + LDC * 1], xmm9);
1883 xorps(xmm9, xmm9);
1884 lea(CO1, ptr[CO1 + LDC * 2]);
1885 sub(I, 0x2);
1886 cmp(I, 0x2);
1887 jge(labels[41], T_NEAR);
1888 align(4);
1889
1890 L(labels[49]);
1891 test(I, 0x1);
1892 jle(labels[57], T_NEAR);
1893 mov(AO, A);
1894 movdqu(xmm0, xword[AO - 0x80]);
1895 movdqu(xmm5, xword[BO - 0x80]);
1896 mov(H, K);
1897 sar(H, 0x3);
1898 jle(labels[53], T_NEAR);
1899 sub(H, 0x8);
1900 jle(labels[51], T_NEAR);
1901 align(4);
1902
1903 L(labels[50]);
1904 pshufd(xmm4, xmm5, 0x0);
1905 pmaddubsw(xmm4, xmm0);
1906 pmaddwd(xmm4, xmm7);
1907 paddd(xmm8, xmm4);
1908 prefetcht0(byte[AO + 0x180]);
1909 prefetcht0(byte[BO]);
1910 movdqu(xmm0, xword[AO - 0x7c]);
1911 prefetcht0(byte[AO + 0x1c0]);
1912 pshufd(xmm4, xmm5, 0x55);
1913 pmaddubsw(xmm4, xmm0);
1914 pmaddwd(xmm4, xmm7);
1915 paddd(xmm8, xmm4);
1916 movdqu(xmm5, xword[BO - 0x78]);
1917 prefetcht1(byte[AA - 0x80]);
1918 movdqu(xmm0, xword[AO - 0x78]);
1919 add(AA, 0x4);
1920 add(AO, 0x8);
1921 add(BO, 0x8);
1922 sub(H, 0x1);
1923 jg(labels[50], T_NEAR);
1924 align(4);
1925
1926 L(labels[51]);
1927 prefetcht0(byte[CO1 + 0x3c]);
1928 add(H, 0x8);
1929 jle(labels[53], T_NEAR);
1930 align(4);
1931
1932 L(labels[52]);
1933 pshufd(xmm4, xmm5, 0x0);
1934 pmaddubsw(xmm4, xmm0);
1935 pmaddwd(xmm4, xmm7);
1936 paddd(xmm8, xmm4);
1937 prefetcht0(byte[AO + 0x180]);
1938 prefetcht0(byte[BO]);
1939 movdqu(xmm0, xword[AO - 0x7c]);
1940 prefetcht0(byte[AO + 0x1c0]);
1941 pshufd(xmm4, xmm5, 0x55);
1942 pmaddubsw(xmm4, xmm0);
1943 pmaddwd(xmm4, xmm7);
1944 paddd(xmm8, xmm4);
1945 movdqu(xmm5, xword[BO - 0x78]);
1946 prefetcht1(byte[AA - 0x80]);
1947 movdqu(xmm0, xword[AO - 0x78]);
1948 add(AA, 0x4);
1949 add(AO, 0x8);
1950 add(BO, 0x8);
1951 sub(H, 0x1);
1952 jg(labels[52], T_NEAR);
1953 align(4);
1954
1955 L(labels[53]);
1956 mov(H, K);
1957 test(H, 0x4);
1958 je(labels[54], T_NEAR);
1959 pshufd(xmm4, xmm5, 0x0);
1960 pmaddubsw(xmm4, xmm0);
1961 pmaddwd(xmm4, xmm7);
1962 paddd(xmm8, xmm4);
1963 add(AO, 0x4);
1964 add(BO, 0x4);
1965 align(4);
1966
1967 L(labels[54]);
1968 mov(H, K);
1969 test(H, 0x2);
1970 je(labels[55], T_NEAR);
1971 xorps(xmm6, xmm6);
1972 movdqu(xmm0, xword[AO - 0x80]);
1973 movaps(xmm1, xmm0);
1974 punpcklwd(xmm0, xmm6);
1975 movss(xmm5, dword[BO - 0x80]);
1976 punpcklwd(xmm5, xmm5);
1977 pshufd(xmm4, xmm5, 0x0);
1978 pmaddubsw(xmm4, xmm0);
1979 pmaddwd(xmm4, xmm7);
1980 paddd(xmm8, xmm4);
1981 add(AO, 0x2);
1982 add(BO, 0x2);
1983 align(4);
1984
1985 L(labels[55]);
1986 mov(H, K);
1987 test(H, 0x1);
1988 je(labels[56], T_NEAR);
1989 xorps(xmm6, xmm6);
1990 movdqu(xmm3, xword[AO - 0x80]);
1991 pshufd(xmm0, xmm3, 0x0);
1992 punpcklbw(xmm0, xmm6);
1993 punpcklwd(xmm0, xmm6);
1994 movd(xmm5, dword[BO - 0x80]);
1995 punpcklbw(xmm5, xmm5);
1996 punpcklwd(xmm5, xmm5);
1997 pshufd(xmm4, xmm5, 0x0);
1998 pmaddubsw(xmm4, xmm0);
1999 pmaddwd(xmm4, xmm7);
2000 paddd(xmm8, xmm4);
2001 add(AO, 0x1);
2002 add(BO, 0x1);
2003 align(4);
2004
2005 L(labels[56]);
2006 movss(dword[CO1], xmm8);
2007 xorps(xmm8, xmm8);
2008 lea(CO1, ptr[CO1 + LDC * 1]);
2009 align(4);
2010
2011 L(labels[57]);
2012 mov(A, AO);
2013 align(4);
2014
2015 L(labels[58]);
2016 add(rsp, stack_alloc_size);
2017 postamble();
2018 }
2019 outLocalLabel();
2020
2021 #undef M
2022 #undef N
2023 #undef K
2024 #undef A
2025 #undef B
2026 #undef C
2027 #undef LDC
2028 #undef AA
2029 #undef I
2030 #undef J
2031 #undef H
2032 #undef AO
2033 #undef BO
2034 #undef CO1
2035 #undef CO2
2036 #ifdef _WIN32
2037 #undef ARG_A
2038 #undef ARG_B
2039 #endif
2040 #undef ARG_C
2041 #undef ARG_LDC
2042 }
2043
2044 } // namespace x64
2045 } // namespace cpu
2046 } // namespace impl
2047 } // namespace dnnl
2048