1 /*******************************************************************************
2 * Copyright 2019-2021 Intel Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16
17 #include "cpu/x64/jit_generator.hpp"
18
19 #include "cpu/x64/gemm/f32/common_f32.hpp"
20
21 namespace dnnl {
22 namespace impl {
23 namespace cpu {
24 namespace x64 {
25
// Constructor for the SSE4.1 sgemm kernel generator.
// It only forwards a null pointer and F32_COMPUTE_KERNEL_CODE_SIZE to the
// jit_generator base-class constructor; the body is empty, so no instructions
// are emitted here (the emitter calls live in generate()).
// NOTE(review): presumably the null pointer asks the base class to allocate
// its own code buffer and the constant bounds that buffer's size -- confirm
// against cpu/x64/jit_generator.hpp.
jit_sse41_kernel_b0_sgemm_kern()26 jit_sse41_kernel_b0_sgemm_kern::jit_sse41_kernel_b0_sgemm_kern()
27 : jit_generator(nullptr, F32_COMPUTE_KERNEL_CODE_SIZE) {}
28
generate()29 void jit_sse41_kernel_b0_sgemm_kern::generate() {
30
31 #ifndef _WIN32
32
33 #define M rdi
34 #define N rsi
35 #define K rdx
36 #define A r8
37 #define B r9
38 #define C rcx
39 #define LDC r10
40
41 #define AA r15
42 #define I r11
43 #define J r12
44 #define H rax
45 #define AO rbx
46 #define BO rbp
47 #define CO1 r13
48 #define CO2 r14
49
50 #define OLD_C (8 + stacksize + rsp)
51 #define OLD_LDC (16 + stacksize + rsp)
52
53 #else
54
55 #define M rcx
56 #define N rdx
57 #define K r8
58 #define A rdi
59 #define B rsi
60 #define C r9
61 #define LDC r10
62 #define AA r15
63 #define I r11
64 #define J r12
65 #define H rax
66 #define AO rbx
67 #define BO rbp
68 #define CO1 r13
69 #define CO2 r14
70
71 #define OLD_A 40 + stacksize + rsp
72 #define OLD_B 48 + stacksize + rsp
73 #define OLD_C 56 + stacksize + rsp
74 #define OLD_LDC 64 + stacksize + rsp
75
76 #endif
77
78 inLocalLabel();
79 {
80 std::vector<Xbyak::Label> labels(93);
81 preamble();
82 auto stacksize = get_size_of_abi_save_regs();
83 #ifdef _WIN32
84 mov(A, ptr[OLD_A]);
85 mov(B, ptr[OLD_B]);
86 #endif
87 mov(C, ptr[OLD_C]);
88 mov(LDC, ptr[OLD_LDC]);
89
90 mov(M, qword[M]);
91 mov(N, qword[N]);
92 mov(K, qword[K]);
93 shl(LDC, 0x2);
94 sub(A, -128);
95 sub(B, -128);
96 mov(J, M);
97 cmp(J, 0x8);
98 jl(labels[90], T_NEAR);
99 align(4);
100
101 L(labels[72]);
102 mov(AA, K);
103 imul(AA, AA, 0x20);
104 add(AA, A);
105 mov(CO1, C);
106 add(C, 0x20);
107 mov(BO, B);
108 mov(I, N);
109 cmp(I, 0x4);
110 jl(labels[73], T_NEAR);
111 align(4);
112
113 L(labels[75]);
114 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
115 movups(xmm0, xword[A - 0x80]);
116 xorps(xmm8, xmm8);
117 movups(xmm1, xword[A - 0x70]);
118 xorps(xmm9, xmm9);
119 movups(xmm2, xword[A - 0x60]);
120 xorps(xmm10, xmm10);
121 movups(xmm3, xword[A - 0x50]);
122 xorps(xmm11, xmm11);
123 movaps(xmm4, xword[BO - 0x80]);
124 xorps(xmm12, xmm12);
125 movaps(xmm5, xword[BO - 0x70]);
126 xorps(xmm13, xmm13);
127 xorps(xmm14, xmm14);
128 xorps(xmm15, xmm15);
129 mov(AO, A);
130 mov(H, K);
131 sar(H, 0x2);
132 jle(labels[69], T_NEAR);
133 sub(H, 0x1e);
134 jle(labels[57], T_NEAR);
135 align(4);
136
137 L(labels[85]);
138 prefetcht0(byte[AO + 0x180]);
139 prefetcht0(byte[BO + 0x100]);
140 pshufd(xmm6, xmm4, 0xb1);
141 movaps(xmm7, xmm4);
142 mulps(xmm4, xmm0);
143 mulps(xmm7, xmm1);
144 addps(xmm8, xmm4);
145 addps(xmm12, xmm7);
146 pshufd(xmm4, xmm6, 0x1b);
147 movaps(xmm7, xmm6);
148 mulps(xmm6, xmm0);
149 mulps(xmm7, xmm1);
150 addps(xmm9, xmm6);
151 addps(xmm13, xmm7);
152 pshufd(xmm6, xmm4, 0xb1);
153 movaps(xmm7, xmm4);
154 mulps(xmm4, xmm0);
155 mulps(xmm7, xmm1);
156 addps(xmm10, xmm4);
157 movaps(xmm4, xword[BO - 0x60]);
158 addps(xmm14, xmm7);
159 movaps(xmm7, xmm6);
160 mulps(xmm6, xmm0);
161 movups(xmm0, xword[AO - 0x40]);
162 mulps(xmm7, xmm1);
163 movups(xmm1, xword[AO - 0x30]);
164 addps(xmm11, xmm6);
165 addps(xmm15, xmm7);
166 pshufd(xmm6, xmm5, 0xb1);
167 movaps(xmm7, xmm5);
168 mulps(xmm5, xmm2);
169 mulps(xmm7, xmm3);
170 addps(xmm8, xmm5);
171 addps(xmm12, xmm7);
172 pshufd(xmm5, xmm6, 0x1b);
173 movaps(xmm7, xmm6);
174 mulps(xmm6, xmm2);
175 mulps(xmm7, xmm3);
176 addps(xmm9, xmm6);
177 addps(xmm13, xmm7);
178 pshufd(xmm6, xmm5, 0xb1);
179 movaps(xmm7, xmm5);
180 mulps(xmm5, xmm2);
181 mulps(xmm7, xmm3);
182 addps(xmm10, xmm5);
183 movaps(xmm5, xword[BO - 0x50]);
184 addps(xmm14, xmm7);
185 movaps(xmm7, xmm6);
186 mulps(xmm6, xmm2);
187 movups(xmm2, xword[AO - 0x20]);
188 mulps(xmm7, xmm3);
189 movups(xmm3, xword[AO - 0x10]);
190 addps(xmm11, xmm6);
191 addps(xmm15, xmm7);
192 pshufd(xmm6, xmm4, 0xb1);
193 prefetcht0(byte[AO + 0x1c0]);
194 movaps(xmm7, xmm4);
195 mulps(xmm4, xmm0);
196 mulps(xmm7, xmm1);
197 addps(xmm8, xmm4);
198 addps(xmm12, xmm7);
199 pshufd(xmm4, xmm6, 0x1b);
200 movaps(xmm7, xmm6);
201 mulps(xmm6, xmm0);
202 mulps(xmm7, xmm1);
203 addps(xmm9, xmm6);
204 addps(xmm13, xmm7);
205 pshufd(xmm6, xmm4, 0xb1);
206 movaps(xmm7, xmm4);
207 mulps(xmm4, xmm0);
208 mulps(xmm7, xmm1);
209 addps(xmm10, xmm4);
210 movaps(xmm4, xword[BO - 0x40]);
211 addps(xmm14, xmm7);
212 movaps(xmm7, xmm6);
213 mulps(xmm6, xmm0);
214 movups(xmm0, xword[AO]);
215 mulps(xmm7, xmm1);
216 movups(xmm1, xword[AO + 0x10]);
217 addps(xmm11, xmm6);
218 addps(xmm15, xmm7);
219 pshufd(xmm6, xmm5, 0xb1);
220 movaps(xmm7, xmm5);
221 mulps(xmm5, xmm2);
222 mulps(xmm7, xmm3);
223 addps(xmm8, xmm5);
224 addps(xmm12, xmm7);
225 pshufd(xmm5, xmm6, 0x1b);
226 movaps(xmm7, xmm6);
227 mulps(xmm6, xmm2);
228 mulps(xmm7, xmm3);
229 addps(xmm9, xmm6);
230 addps(xmm13, xmm7);
231 pshufd(xmm6, xmm5, 0xb1);
232 movaps(xmm7, xmm5);
233 mulps(xmm5, xmm2);
234 mulps(xmm7, xmm3);
235 addps(xmm10, xmm5);
236 movaps(xmm5, xword[BO - 0x30]);
237 addps(xmm14, xmm7);
238 add(AA, 0x8);
239 sub(BO, -64);
240 movaps(xmm7, xmm6);
241 mulps(xmm6, xmm2);
242 movups(xmm2, xword[AO + 0x20]);
243 mulps(xmm7, xmm3);
244 movups(xmm3, xword[AO + 0x30]);
245 sub(AO, -128);
246 addps(xmm11, xmm6);
247 addps(xmm15, xmm7);
248 prefetcht0(byte[AA - 0x78]);
249 sub(H, 0x1);
250 jg(labels[85], T_NEAR);
251 align(4);
252
253 L(labels[57]);
254 prefetcht0(byte[CO1 + 0x1c]);
255 prefetcht0(byte[CO1 + LDC * 1 + 0x1c]);
256 prefetcht0(byte[CO2 + 0x1c]);
257 prefetcht0(byte[CO2 + LDC * 1 + 0x1c]);
258 add(H, 0x1e);
259 align(4);
260
261 L(labels[61]);
262 prefetcht0(byte[AO + 0x180]);
263 prefetcht0(byte[BO + 0x100]);
264 pshufd(xmm6, xmm4, 0xb1);
265 movaps(xmm7, xmm4);
266 mulps(xmm4, xmm0);
267 mulps(xmm7, xmm1);
268 addps(xmm8, xmm4);
269 addps(xmm12, xmm7);
270 pshufd(xmm4, xmm6, 0x1b);
271 movaps(xmm7, xmm6);
272 mulps(xmm6, xmm0);
273 mulps(xmm7, xmm1);
274 addps(xmm9, xmm6);
275 addps(xmm13, xmm7);
276 pshufd(xmm6, xmm4, 0xb1);
277 movaps(xmm7, xmm4);
278 mulps(xmm4, xmm0);
279 mulps(xmm7, xmm1);
280 addps(xmm10, xmm4);
281 movaps(xmm4, xword[BO - 0x60]);
282 addps(xmm14, xmm7);
283 movaps(xmm7, xmm6);
284 mulps(xmm6, xmm0);
285 movups(xmm0, xword[AO - 0x40]);
286 mulps(xmm7, xmm1);
287 movups(xmm1, xword[AO - 0x30]);
288 addps(xmm11, xmm6);
289 addps(xmm15, xmm7);
290 pshufd(xmm6, xmm5, 0xb1);
291 movaps(xmm7, xmm5);
292 mulps(xmm5, xmm2);
293 mulps(xmm7, xmm3);
294 addps(xmm8, xmm5);
295 addps(xmm12, xmm7);
296 pshufd(xmm5, xmm6, 0x1b);
297 movaps(xmm7, xmm6);
298 mulps(xmm6, xmm2);
299 mulps(xmm7, xmm3);
300 addps(xmm9, xmm6);
301 addps(xmm13, xmm7);
302 pshufd(xmm6, xmm5, 0xb1);
303 movaps(xmm7, xmm5);
304 mulps(xmm5, xmm2);
305 mulps(xmm7, xmm3);
306 addps(xmm10, xmm5);
307 movaps(xmm5, xword[BO - 0x50]);
308 addps(xmm14, xmm7);
309 movaps(xmm7, xmm6);
310 mulps(xmm6, xmm2);
311 movups(xmm2, xword[AO - 0x20]);
312 mulps(xmm7, xmm3);
313 movups(xmm3, xword[AO - 0x10]);
314 addps(xmm11, xmm6);
315 addps(xmm15, xmm7);
316 pshufd(xmm6, xmm4, 0xb1);
317 prefetcht0(byte[AO + 0x1c0]);
318 movaps(xmm7, xmm4);
319 mulps(xmm4, xmm0);
320 mulps(xmm7, xmm1);
321 addps(xmm8, xmm4);
322 addps(xmm12, xmm7);
323 pshufd(xmm4, xmm6, 0x1b);
324 movaps(xmm7, xmm6);
325 mulps(xmm6, xmm0);
326 mulps(xmm7, xmm1);
327 addps(xmm9, xmm6);
328 addps(xmm13, xmm7);
329 pshufd(xmm6, xmm4, 0xb1);
330 movaps(xmm7, xmm4);
331 mulps(xmm4, xmm0);
332 mulps(xmm7, xmm1);
333 addps(xmm10, xmm4);
334 movaps(xmm4, xword[BO - 0x40]);
335 addps(xmm14, xmm7);
336 movaps(xmm7, xmm6);
337 mulps(xmm6, xmm0);
338 movups(xmm0, xword[AO]);
339 mulps(xmm7, xmm1);
340 movups(xmm1, xword[AO + 0x10]);
341 addps(xmm11, xmm6);
342 addps(xmm15, xmm7);
343 pshufd(xmm6, xmm5, 0xb1);
344 movaps(xmm7, xmm5);
345 mulps(xmm5, xmm2);
346 mulps(xmm7, xmm3);
347 addps(xmm8, xmm5);
348 addps(xmm12, xmm7);
349 pshufd(xmm5, xmm6, 0x1b);
350 movaps(xmm7, xmm6);
351 mulps(xmm6, xmm2);
352 mulps(xmm7, xmm3);
353 addps(xmm9, xmm6);
354 addps(xmm13, xmm7);
355 pshufd(xmm6, xmm5, 0xb1);
356 movaps(xmm7, xmm5);
357 mulps(xmm5, xmm2);
358 mulps(xmm7, xmm3);
359 addps(xmm10, xmm5);
360 movaps(xmm5, xword[BO - 0x30]);
361 addps(xmm14, xmm7);
362 add(AA, 0x8);
363 sub(BO, -64);
364 movaps(xmm7, xmm6);
365 mulps(xmm6, xmm2);
366 movups(xmm2, xword[AO + 0x20]);
367 mulps(xmm7, xmm3);
368 movups(xmm3, xword[AO + 0x30]);
369 sub(AO, -128);
370 addps(xmm11, xmm6);
371 addps(xmm15, xmm7);
372 prefetcht0(byte[AA - 0x78]);
373 sub(H, 0x1);
374 jg(labels[61], T_NEAR);
375 align(4);
376
377 L(labels[69]);
378 mov(H, K);
379 and_(H, 0x3);
380 je(labels[71], T_NEAR);
381 align(4);
382
383 L(labels[70]);
384 pshufd(xmm6, xmm4, 0xb1);
385 movaps(xmm7, xmm4);
386 mulps(xmm4, xmm0);
387 mulps(xmm7, xmm1);
388 addps(xmm8, xmm4);
389 addps(xmm12, xmm7);
390 pshufd(xmm4, xmm6, 0x1b);
391 movaps(xmm7, xmm6);
392 mulps(xmm6, xmm0);
393 mulps(xmm7, xmm1);
394 addps(xmm9, xmm6);
395 addps(xmm13, xmm7);
396 pshufd(xmm6, xmm4, 0xb1);
397 movaps(xmm7, xmm4);
398 mulps(xmm4, xmm0);
399 mulps(xmm7, xmm1);
400 addps(xmm10, xmm4);
401 movaps(xmm4, xword[BO - 0x70]);
402 addps(xmm14, xmm7);
403 movaps(xmm7, xmm6);
404 mulps(xmm6, xmm0);
405 movups(xmm0, xword[AO - 0x60]);
406 mulps(xmm7, xmm1);
407 movups(xmm1, xword[AO - 0x50]);
408 addps(xmm11, xmm6);
409 addps(xmm15, xmm7);
410 sub(AO, -32);
411 sub(BO, -16);
412 dec(H);
413 jg(labels[70], T_NEAR);
414 align(4);
415
416 L(labels[71]);
417 movaps(xmm0, xmm8);
418 unpcklpd(xmm8, xmm9);
419 unpckhpd(xmm0, xmm9);
420 movaps(xmm1, xmm10);
421 unpckhpd(xmm10, xmm11);
422 unpcklpd(xmm1, xmm11);
423 movaps(xmm9, xmm8);
424 shufps(xmm8, xmm10, 0xcc);
425 shufps(xmm9, xmm10, 0x66);
426 movaps(xmm10, xmm1);
427 movaps(xmm11, xmm1);
428 shufps(xmm10, xmm0, 0xcc);
429 shufps(xmm11, xmm0, 0x66);
430 movaps(xmm0, xmm12);
431 unpcklpd(xmm12, xmm13);
432 unpckhpd(xmm0, xmm13);
433 movaps(xmm1, xmm14);
434 unpckhpd(xmm14, xmm15);
435 unpcklpd(xmm1, xmm15);
436 movaps(xmm13, xmm12);
437 shufps(xmm12, xmm14, 0xcc);
438 shufps(xmm13, xmm14, 0x66);
439 movaps(xmm14, xmm1);
440 movaps(xmm15, xmm1);
441 shufps(xmm14, xmm0, 0xcc);
442 shufps(xmm15, xmm0, 0x66);
443 movups(xword[CO1 + 0x0], xmm8);
444 movups(xword[CO1 + 0x10], xmm12);
445 movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
446 movups(xword[CO1 + LDC * 1 + 0x10], xmm13);
447 movups(xword[CO2], xmm10);
448 movups(xword[CO2 + 0x10], xmm14);
449 movups(xword[CO2 + LDC * 1], xmm11);
450 movups(xword[CO2 + LDC * 1 + 0x10], xmm15);
451 lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
452 lea(CO2, ptr[CO2 + LDC * 4]);
453 sub(I, 0x4);
454 cmp(I, 0x4);
455 jge(labels[75], T_NEAR);
456 align(4);
457
458 L(labels[73]);
459 test(I, 0x2);
460 jle(labels[81], T_NEAR);
461 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
462 movups(xmm0, xword[A - 0x80]);
463 xorps(xmm8, xmm8);
464 movups(xmm1, xword[A - 0x70]);
465 xorps(xmm9, xmm9);
466 movups(xmm2, xword[A - 0x60]);
467 xorps(xmm10, xmm10);
468 movups(xmm3, xword[A - 0x50]);
469 xorps(xmm11, xmm11);
470 movddup(xmm4, qword[BO - 0x80]);
471 xorps(xmm12, xmm12);
472 movddup(xmm5, qword[BO - 0x78]);
473 xorps(xmm13, xmm13);
474 xorps(xmm14, xmm14);
475 xorps(xmm15, xmm15);
476 mov(AO, A);
477 mov(H, K);
478 sar(H, 0x2);
479 jle(labels[78], T_NEAR);
480 sub(H, 0x1e);
481 jle(labels[76], T_NEAR);
482 align(4);
483
484 L(labels[74]);
485 prefetcht0(byte[AO + 0x180]);
486 prefetcht0(byte[BO + 0x100]);
487 pshufd(xmm6, xmm4, 0xb1);
488 movaps(xmm7, xmm4);
489 mulps(xmm4, xmm0);
490 mulps(xmm7, xmm1);
491 addps(xmm8, xmm4);
492 addps(xmm12, xmm7);
493 pshufd(xmm4, xmm6, 0x1b);
494 movaps(xmm7, xmm6);
495 mulps(xmm6, xmm0);
496 mulps(xmm7, xmm1);
497 addps(xmm9, xmm6);
498 addps(xmm13, xmm7);
499 pshufd(xmm6, xmm4, 0xb1);
500 movaps(xmm7, xmm4);
501 mulps(xmm4, xmm0);
502 mulps(xmm7, xmm1);
503 addps(xmm10, xmm4);
504 movddup(xmm4, qword[BO - 0x70]);
505 addps(xmm14, xmm7);
506 movaps(xmm7, xmm6);
507 mulps(xmm6, xmm0);
508 movups(xmm0, xword[AO - 0x40]);
509 mulps(xmm7, xmm1);
510 movups(xmm1, xword[AO - 0x30]);
511 addps(xmm11, xmm6);
512 addps(xmm15, xmm7);
513 pshufd(xmm6, xmm5, 0xb1);
514 movaps(xmm7, xmm5);
515 mulps(xmm5, xmm2);
516 mulps(xmm7, xmm3);
517 addps(xmm8, xmm5);
518 addps(xmm12, xmm7);
519 pshufd(xmm5, xmm6, 0x1b);
520 movaps(xmm7, xmm6);
521 mulps(xmm6, xmm2);
522 mulps(xmm7, xmm3);
523 addps(xmm9, xmm6);
524 addps(xmm13, xmm7);
525 pshufd(xmm6, xmm5, 0xb1);
526 movaps(xmm7, xmm5);
527 mulps(xmm5, xmm2);
528 mulps(xmm7, xmm3);
529 addps(xmm10, xmm5);
530 movddup(xmm5, qword[BO - 0x68]);
531 addps(xmm14, xmm7);
532 movaps(xmm7, xmm6);
533 mulps(xmm6, xmm2);
534 movups(xmm2, xword[AO - 0x20]);
535 mulps(xmm7, xmm3);
536 movups(xmm3, xword[AO - 0x10]);
537 addps(xmm11, xmm6);
538 addps(xmm15, xmm7);
539 pshufd(xmm6, xmm4, 0xb1);
540 prefetcht0(byte[AO + 0x1c0]);
541 movaps(xmm7, xmm4);
542 mulps(xmm4, xmm0);
543 mulps(xmm7, xmm1);
544 addps(xmm8, xmm4);
545 addps(xmm12, xmm7);
546 pshufd(xmm4, xmm6, 0x1b);
547 movaps(xmm7, xmm6);
548 mulps(xmm6, xmm0);
549 mulps(xmm7, xmm1);
550 addps(xmm9, xmm6);
551 addps(xmm13, xmm7);
552 pshufd(xmm6, xmm4, 0xb1);
553 movaps(xmm7, xmm4);
554 mulps(xmm4, xmm0);
555 mulps(xmm7, xmm1);
556 addps(xmm10, xmm4);
557 movddup(xmm4, qword[BO - 0x60]);
558 addps(xmm14, xmm7);
559 movaps(xmm7, xmm6);
560 mulps(xmm6, xmm0);
561 movups(xmm0, xword[AO]);
562 mulps(xmm7, xmm1);
563 movups(xmm1, xword[AO + 0x10]);
564 addps(xmm11, xmm6);
565 addps(xmm15, xmm7);
566 pshufd(xmm6, xmm5, 0xb1);
567 movaps(xmm7, xmm5);
568 mulps(xmm5, xmm2);
569 mulps(xmm7, xmm3);
570 addps(xmm8, xmm5);
571 addps(xmm12, xmm7);
572 pshufd(xmm5, xmm6, 0x1b);
573 movaps(xmm7, xmm6);
574 mulps(xmm6, xmm2);
575 mulps(xmm7, xmm3);
576 addps(xmm9, xmm6);
577 addps(xmm13, xmm7);
578 pshufd(xmm6, xmm5, 0xb1);
579 movaps(xmm7, xmm5);
580 mulps(xmm5, xmm2);
581 mulps(xmm7, xmm3);
582 addps(xmm10, xmm5);
583 movddup(xmm5, qword[BO - 0x58]);
584 addps(xmm14, xmm7);
585 add(AA, 0x8);
586 sub(BO, -32);
587 movaps(xmm7, xmm6);
588 mulps(xmm6, xmm2);
589 movups(xmm2, xword[AO + 0x20]);
590 mulps(xmm7, xmm3);
591 movups(xmm3, xword[AO + 0x30]);
592 sub(AO, -128);
593 addps(xmm11, xmm6);
594 addps(xmm15, xmm7);
595 prefetcht0(byte[AA - 0x78]);
596 sub(H, 0x1);
597 jg(labels[74], T_NEAR);
598 align(4);
599
600 L(labels[76]);
601 prefetcht0(byte[CO1 + 0x1c]);
602 prefetcht0(byte[CO1 + LDC * 1 + 0x1c]);
603 add(H, 0x1e);
604 align(4);
605
606 L(labels[77]);
607 prefetcht0(byte[AO + 0x180]);
608 prefetcht0(byte[BO + 0x100]);
609 pshufd(xmm6, xmm4, 0xb1);
610 movaps(xmm7, xmm4);
611 mulps(xmm4, xmm0);
612 mulps(xmm7, xmm1);
613 addps(xmm8, xmm4);
614 addps(xmm12, xmm7);
615 pshufd(xmm4, xmm6, 0x1b);
616 movaps(xmm7, xmm6);
617 mulps(xmm6, xmm0);
618 mulps(xmm7, xmm1);
619 addps(xmm9, xmm6);
620 addps(xmm13, xmm7);
621 pshufd(xmm6, xmm4, 0xb1);
622 movaps(xmm7, xmm4);
623 mulps(xmm4, xmm0);
624 mulps(xmm7, xmm1);
625 addps(xmm10, xmm4);
626 movddup(xmm4, qword[BO - 0x70]);
627 addps(xmm14, xmm7);
628 movaps(xmm7, xmm6);
629 mulps(xmm6, xmm0);
630 movups(xmm0, xword[AO - 0x40]);
631 mulps(xmm7, xmm1);
632 movups(xmm1, xword[AO - 0x30]);
633 addps(xmm11, xmm6);
634 addps(xmm15, xmm7);
635 pshufd(xmm6, xmm5, 0xb1);
636 movaps(xmm7, xmm5);
637 mulps(xmm5, xmm2);
638 mulps(xmm7, xmm3);
639 addps(xmm8, xmm5);
640 addps(xmm12, xmm7);
641 pshufd(xmm5, xmm6, 0x1b);
642 movaps(xmm7, xmm6);
643 mulps(xmm6, xmm2);
644 mulps(xmm7, xmm3);
645 addps(xmm9, xmm6);
646 addps(xmm13, xmm7);
647 pshufd(xmm6, xmm5, 0xb1);
648 movaps(xmm7, xmm5);
649 mulps(xmm5, xmm2);
650 mulps(xmm7, xmm3);
651 addps(xmm10, xmm5);
652 movddup(xmm5, qword[BO - 0x68]);
653 addps(xmm14, xmm7);
654 movaps(xmm7, xmm6);
655 mulps(xmm6, xmm2);
656 movups(xmm2, xword[AO - 0x20]);
657 mulps(xmm7, xmm3);
658 movups(xmm3, xword[AO - 0x10]);
659 addps(xmm11, xmm6);
660 addps(xmm15, xmm7);
661 pshufd(xmm6, xmm4, 0xb1);
662 prefetcht0(byte[AO + 0x1c0]);
663 movaps(xmm7, xmm4);
664 mulps(xmm4, xmm0);
665 mulps(xmm7, xmm1);
666 addps(xmm8, xmm4);
667 addps(xmm12, xmm7);
668 pshufd(xmm4, xmm6, 0x1b);
669 movaps(xmm7, xmm6);
670 mulps(xmm6, xmm0);
671 mulps(xmm7, xmm1);
672 addps(xmm9, xmm6);
673 addps(xmm13, xmm7);
674 pshufd(xmm6, xmm4, 0xb1);
675 movaps(xmm7, xmm4);
676 mulps(xmm4, xmm0);
677 mulps(xmm7, xmm1);
678 addps(xmm10, xmm4);
679 movddup(xmm4, qword[BO - 0x60]);
680 addps(xmm14, xmm7);
681 movaps(xmm7, xmm6);
682 mulps(xmm6, xmm0);
683 movups(xmm0, xword[AO]);
684 mulps(xmm7, xmm1);
685 movups(xmm1, xword[AO + 0x10]);
686 addps(xmm11, xmm6);
687 addps(xmm15, xmm7);
688 pshufd(xmm6, xmm5, 0xb1);
689 movaps(xmm7, xmm5);
690 mulps(xmm5, xmm2);
691 mulps(xmm7, xmm3);
692 addps(xmm8, xmm5);
693 addps(xmm12, xmm7);
694 pshufd(xmm5, xmm6, 0x1b);
695 movaps(xmm7, xmm6);
696 mulps(xmm6, xmm2);
697 mulps(xmm7, xmm3);
698 addps(xmm9, xmm6);
699 addps(xmm13, xmm7);
700 pshufd(xmm6, xmm5, 0xb1);
701 movaps(xmm7, xmm5);
702 mulps(xmm5, xmm2);
703 mulps(xmm7, xmm3);
704 addps(xmm10, xmm5);
705 movddup(xmm5, qword[BO - 0x58]);
706 addps(xmm14, xmm7);
707 add(AA, 0x8);
708 sub(BO, -32);
709 movaps(xmm7, xmm6);
710 mulps(xmm6, xmm2);
711 movups(xmm2, xword[AO + 0x20]);
712 mulps(xmm7, xmm3);
713 movups(xmm3, xword[AO + 0x30]);
714 sub(AO, -128);
715 addps(xmm11, xmm6);
716 addps(xmm15, xmm7);
717 prefetcht0(byte[AA - 0x78]);
718 sub(H, 0x1);
719 jg(labels[77], T_NEAR);
720 align(4);
721
722 L(labels[78]);
723 mov(H, K);
724 and_(H, 0x3);
725 je(labels[80], T_NEAR);
726 align(4);
727
728 L(labels[79]);
729 pshufd(xmm6, xmm4, 0xb1);
730 movaps(xmm7, xmm4);
731 mulps(xmm4, xmm0);
732 mulps(xmm7, xmm1);
733 addps(xmm8, xmm4);
734 addps(xmm12, xmm7);
735 pshufd(xmm4, xmm6, 0x1b);
736 movaps(xmm7, xmm6);
737 mulps(xmm6, xmm0);
738 mulps(xmm7, xmm1);
739 addps(xmm9, xmm6);
740 addps(xmm13, xmm7);
741 pshufd(xmm6, xmm4, 0xb1);
742 movaps(xmm7, xmm4);
743 mulps(xmm4, xmm0);
744 mulps(xmm7, xmm1);
745 addps(xmm10, xmm4);
746 movddup(xmm4, qword[BO - 0x78]);
747 addps(xmm14, xmm7);
748 movaps(xmm7, xmm6);
749 mulps(xmm6, xmm0);
750 movups(xmm0, xword[AO - 0x60]);
751 mulps(xmm7, xmm1);
752 movups(xmm1, xword[AO - 0x50]);
753 addps(xmm11, xmm6);
754 addps(xmm15, xmm7);
755 sub(AO, -32);
756 sub(BO, -8);
757 dec(H);
758 jg(labels[79], T_NEAR);
759 align(4);
760
761 L(labels[80]);
762 movaps(xmm0, xmm8);
763 unpcklpd(xmm8, xmm9);
764 unpckhpd(xmm0, xmm9);
765 movaps(xmm1, xmm10);
766 unpckhpd(xmm10, xmm11);
767 unpcklpd(xmm1, xmm11);
768 movaps(xmm9, xmm8);
769 shufps(xmm8, xmm10, 0xcc);
770 shufps(xmm9, xmm10, 0x66);
771 movaps(xmm0, xmm12);
772 unpcklpd(xmm12, xmm13);
773 unpckhpd(xmm0, xmm13);
774 movaps(xmm1, xmm14);
775 unpckhpd(xmm14, xmm15);
776 unpcklpd(xmm1, xmm15);
777 movaps(xmm13, xmm12);
778 shufps(xmm12, xmm14, 0xcc);
779 shufps(xmm13, xmm14, 0x66);
780 movups(xword[CO1 + 0x0], xmm8);
781 movups(xword[CO1 + 0x10], xmm12);
782 movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
783 movups(xword[CO1 + LDC * 1 + 0x10], xmm13);
784 lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
785 lea(CO2, ptr[CO2 + LDC * 2]);
786 align(4);
787
788 L(labels[81]);
789 test(I, 0x1);
790 jle(labels[89], T_NEAR);
791 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
792 movups(xmm0, xword[A - 0x80]);
793 xorps(xmm8, xmm8);
794 movups(xmm1, xword[A - 0x70]);
795 xorps(xmm9, xmm9);
796 movups(xmm2, xword[A - 0x60]);
797 xorps(xmm10, xmm10);
798 movups(xmm3, xword[A - 0x50]);
799 xorps(xmm11, xmm11);
800 movss(xmm4, dword[BO - 0x80]);
801 xorps(xmm12, xmm12);
802 movss(xmm5, dword[BO - 0x7c]);
803 xorps(xmm13, xmm13);
804 xorps(xmm14, xmm14);
805 xorps(xmm15, xmm15);
806 mov(AO, A);
807 mov(H, K);
808 sar(H, 0x2);
809 jle(labels[86], T_NEAR);
810 sub(H, 0x1e);
811 jle(labels[83], T_NEAR);
812 align(4);
813
814 L(labels[82]);
815 prefetcht0(byte[AO + 0x180]);
816 prefetcht0(byte[BO + 0x100]);
817 pshufd(xmm6, xmm4, 0xb1);
818 movaps(xmm7, xmm4);
819 mulps(xmm4, xmm0);
820 mulps(xmm7, xmm1);
821 addps(xmm8, xmm4);
822 addps(xmm12, xmm7);
823 pshufd(xmm4, xmm6, 0x1b);
824 movaps(xmm7, xmm6);
825 mulps(xmm6, xmm0);
826 mulps(xmm7, xmm1);
827 addps(xmm9, xmm6);
828 addps(xmm13, xmm7);
829 pshufd(xmm6, xmm4, 0xb1);
830 movaps(xmm7, xmm4);
831 mulps(xmm4, xmm0);
832 mulps(xmm7, xmm1);
833 addps(xmm10, xmm4);
834 movss(xmm4, dword[BO - 0x78]);
835 addps(xmm14, xmm7);
836 movaps(xmm7, xmm6);
837 mulps(xmm6, xmm0);
838 movups(xmm0, xword[AO - 0x40]);
839 mulps(xmm7, xmm1);
840 movups(xmm1, xword[AO - 0x30]);
841 addps(xmm11, xmm6);
842 addps(xmm15, xmm7);
843 pshufd(xmm6, xmm5, 0xb1);
844 movaps(xmm7, xmm5);
845 mulps(xmm5, xmm2);
846 mulps(xmm7, xmm3);
847 addps(xmm8, xmm5);
848 addps(xmm12, xmm7);
849 pshufd(xmm5, xmm6, 0x1b);
850 movaps(xmm7, xmm6);
851 mulps(xmm6, xmm2);
852 mulps(xmm7, xmm3);
853 addps(xmm9, xmm6);
854 addps(xmm13, xmm7);
855 pshufd(xmm6, xmm5, 0xb1);
856 movaps(xmm7, xmm5);
857 mulps(xmm5, xmm2);
858 mulps(xmm7, xmm3);
859 addps(xmm10, xmm5);
860 movss(xmm5, dword[BO - 0x74]);
861 addps(xmm14, xmm7);
862 movaps(xmm7, xmm6);
863 mulps(xmm6, xmm2);
864 movups(xmm2, xword[AO - 0x20]);
865 mulps(xmm7, xmm3);
866 movups(xmm3, xword[AO - 0x10]);
867 addps(xmm11, xmm6);
868 addps(xmm15, xmm7);
869 pshufd(xmm6, xmm4, 0xb1);
870 prefetcht0(byte[AO + 0x1c0]);
871 movaps(xmm7, xmm4);
872 mulps(xmm4, xmm0);
873 mulps(xmm7, xmm1);
874 addps(xmm8, xmm4);
875 addps(xmm12, xmm7);
876 pshufd(xmm4, xmm6, 0x1b);
877 movaps(xmm7, xmm6);
878 mulps(xmm6, xmm0);
879 mulps(xmm7, xmm1);
880 addps(xmm9, xmm6);
881 addps(xmm13, xmm7);
882 pshufd(xmm6, xmm4, 0xb1);
883 movaps(xmm7, xmm4);
884 mulps(xmm4, xmm0);
885 mulps(xmm7, xmm1);
886 addps(xmm10, xmm4);
887 movss(xmm4, dword[BO - 0x70]);
888 addps(xmm14, xmm7);
889 movaps(xmm7, xmm6);
890 mulps(xmm6, xmm0);
891 movups(xmm0, xword[AO]);
892 mulps(xmm7, xmm1);
893 movups(xmm1, xword[AO + 0x10]);
894 addps(xmm11, xmm6);
895 addps(xmm15, xmm7);
896 pshufd(xmm6, xmm5, 0xb1);
897 movaps(xmm7, xmm5);
898 mulps(xmm5, xmm2);
899 mulps(xmm7, xmm3);
900 addps(xmm8, xmm5);
901 addps(xmm12, xmm7);
902 pshufd(xmm5, xmm6, 0x1b);
903 movaps(xmm7, xmm6);
904 mulps(xmm6, xmm2);
905 mulps(xmm7, xmm3);
906 addps(xmm9, xmm6);
907 addps(xmm13, xmm7);
908 pshufd(xmm6, xmm5, 0xb1);
909 movaps(xmm7, xmm5);
910 mulps(xmm5, xmm2);
911 mulps(xmm7, xmm3);
912 addps(xmm10, xmm5);
913 movss(xmm5, dword[BO - 0x6c]);
914 addps(xmm14, xmm7);
915 add(AA, 0x8);
916 sub(BO, -16);
917 movaps(xmm7, xmm6);
918 mulps(xmm6, xmm2);
919 movups(xmm2, xword[AO + 0x20]);
920 mulps(xmm7, xmm3);
921 movups(xmm3, xword[AO + 0x30]);
922 sub(AO, -128);
923 addps(xmm11, xmm6);
924 addps(xmm15, xmm7);
925 prefetcht0(byte[AA - 0x78]);
926 sub(H, 0x1);
927 jg(labels[82], T_NEAR);
928 align(4);
929
930 L(labels[83]);
931 prefetcht0(byte[CO1 + 0x1c]);
932 add(H, 0x1e);
933 align(4);
934
935 L(labels[84]);
936 prefetcht0(byte[AO + 0x180]);
937 prefetcht0(byte[BO + 0x100]);
938 pshufd(xmm6, xmm4, 0xb1);
939 movaps(xmm7, xmm4);
940 mulps(xmm4, xmm0);
941 mulps(xmm7, xmm1);
942 addps(xmm8, xmm4);
943 addps(xmm12, xmm7);
944 pshufd(xmm4, xmm6, 0x1b);
945 movaps(xmm7, xmm6);
946 mulps(xmm6, xmm0);
947 mulps(xmm7, xmm1);
948 addps(xmm9, xmm6);
949 addps(xmm13, xmm7);
950 pshufd(xmm6, xmm4, 0xb1);
951 movaps(xmm7, xmm4);
952 mulps(xmm4, xmm0);
953 mulps(xmm7, xmm1);
954 addps(xmm10, xmm4);
955 movss(xmm4, dword[BO - 0x78]);
956 addps(xmm14, xmm7);
957 movaps(xmm7, xmm6);
958 mulps(xmm6, xmm0);
959 movups(xmm0, xword[AO - 0x40]);
960 mulps(xmm7, xmm1);
961 movups(xmm1, xword[AO - 0x30]);
962 addps(xmm11, xmm6);
963 addps(xmm15, xmm7);
964 pshufd(xmm6, xmm5, 0xb1);
965 movaps(xmm7, xmm5);
966 mulps(xmm5, xmm2);
967 mulps(xmm7, xmm3);
968 addps(xmm8, xmm5);
969 addps(xmm12, xmm7);
970 pshufd(xmm5, xmm6, 0x1b);
971 movaps(xmm7, xmm6);
972 mulps(xmm6, xmm2);
973 mulps(xmm7, xmm3);
974 addps(xmm9, xmm6);
975 addps(xmm13, xmm7);
976 pshufd(xmm6, xmm5, 0xb1);
977 movaps(xmm7, xmm5);
978 mulps(xmm5, xmm2);
979 mulps(xmm7, xmm3);
980 addps(xmm10, xmm5);
981 movss(xmm5, dword[BO - 0x74]);
982 addps(xmm14, xmm7);
983 movaps(xmm7, xmm6);
984 mulps(xmm6, xmm2);
985 movups(xmm2, xword[AO - 0x20]);
986 mulps(xmm7, xmm3);
987 movups(xmm3, xword[AO - 0x10]);
988 addps(xmm11, xmm6);
989 addps(xmm15, xmm7);
990 pshufd(xmm6, xmm4, 0xb1);
991 prefetcht0(byte[AO + 0x1c0]);
992 movaps(xmm7, xmm4);
993 mulps(xmm4, xmm0);
994 mulps(xmm7, xmm1);
995 addps(xmm8, xmm4);
996 addps(xmm12, xmm7);
997 pshufd(xmm4, xmm6, 0x1b);
998 movaps(xmm7, xmm6);
999 mulps(xmm6, xmm0);
1000 mulps(xmm7, xmm1);
1001 addps(xmm9, xmm6);
1002 addps(xmm13, xmm7);
1003 pshufd(xmm6, xmm4, 0xb1);
1004 movaps(xmm7, xmm4);
1005 mulps(xmm4, xmm0);
1006 mulps(xmm7, xmm1);
1007 addps(xmm10, xmm4);
1008 movss(xmm4, dword[BO - 0x70]);
1009 addps(xmm14, xmm7);
1010 movaps(xmm7, xmm6);
1011 mulps(xmm6, xmm0);
1012 movups(xmm0, xword[AO]);
1013 mulps(xmm7, xmm1);
1014 movups(xmm1, xword[AO + 0x10]);
1015 addps(xmm11, xmm6);
1016 addps(xmm15, xmm7);
1017 pshufd(xmm6, xmm5, 0xb1);
1018 movaps(xmm7, xmm5);
1019 mulps(xmm5, xmm2);
1020 mulps(xmm7, xmm3);
1021 addps(xmm8, xmm5);
1022 addps(xmm12, xmm7);
1023 pshufd(xmm5, xmm6, 0x1b);
1024 movaps(xmm7, xmm6);
1025 mulps(xmm6, xmm2);
1026 mulps(xmm7, xmm3);
1027 addps(xmm9, xmm6);
1028 addps(xmm13, xmm7);
1029 pshufd(xmm6, xmm5, 0xb1);
1030 movaps(xmm7, xmm5);
1031 mulps(xmm5, xmm2);
1032 mulps(xmm7, xmm3);
1033 addps(xmm10, xmm5);
1034 movss(xmm5, dword[BO - 0x6c]);
1035 addps(xmm14, xmm7);
1036 add(AA, 0x8);
1037 sub(BO, -16);
1038 movaps(xmm7, xmm6);
1039 mulps(xmm6, xmm2);
1040 movups(xmm2, xword[AO + 0x20]);
1041 mulps(xmm7, xmm3);
1042 movups(xmm3, xword[AO + 0x30]);
1043 sub(AO, -128);
1044 addps(xmm11, xmm6);
1045 addps(xmm15, xmm7);
1046 prefetcht0(byte[AA - 0x78]);
1047 sub(H, 0x1);
1048 jg(labels[84], T_NEAR);
1049 align(4);
1050
1051 L(labels[86]);
1052 mov(H, K);
1053 and_(H, 0x3);
1054 je(labels[88], T_NEAR);
1055 align(4);
1056
1057 L(labels[87]);
1058 pshufd(xmm6, xmm4, 0xb1);
1059 movaps(xmm7, xmm4);
1060 mulps(xmm4, xmm0);
1061 mulps(xmm7, xmm1);
1062 addps(xmm8, xmm4);
1063 addps(xmm12, xmm7);
1064 pshufd(xmm4, xmm6, 0x1b);
1065 movaps(xmm7, xmm6);
1066 mulps(xmm6, xmm0);
1067 mulps(xmm7, xmm1);
1068 addps(xmm9, xmm6);
1069 addps(xmm13, xmm7);
1070 pshufd(xmm6, xmm4, 0xb1);
1071 movaps(xmm7, xmm4);
1072 mulps(xmm4, xmm0);
1073 mulps(xmm7, xmm1);
1074 addps(xmm10, xmm4);
1075 movss(xmm4, dword[BO - 0x7c]);
1076 addps(xmm14, xmm7);
1077 movaps(xmm7, xmm6);
1078 mulps(xmm6, xmm0);
1079 movups(xmm0, xword[AO - 0x60]);
1080 mulps(xmm7, xmm1);
1081 movups(xmm1, xword[AO - 0x50]);
1082 addps(xmm11, xmm6);
1083 addps(xmm15, xmm7);
1084 sub(AO, -32);
1085 sub(BO, -4);
1086 dec(H);
1087 jg(labels[87], T_NEAR);
1088 align(4);
1089
1090 L(labels[88]);
1091 movaps(xmm0, xmm8);
1092 unpcklpd(xmm8, xmm9);
1093 unpckhpd(xmm0, xmm9);
1094 movaps(xmm1, xmm10);
1095 unpckhpd(xmm10, xmm11);
1096 unpcklpd(xmm1, xmm11);
1097 movaps(xmm9, xmm8);
1098 shufps(xmm8, xmm10, 0xcc);
1099 shufps(xmm9, xmm10, 0x66);
1100 movaps(xmm0, xmm12);
1101 unpcklpd(xmm12, xmm13);
1102 unpckhpd(xmm0, xmm13);
1103 movaps(xmm1, xmm14);
1104 unpckhpd(xmm14, xmm15);
1105 unpcklpd(xmm1, xmm15);
1106 movaps(xmm13, xmm12);
1107 shufps(xmm12, xmm14, 0xcc);
1108 shufps(xmm13, xmm14, 0x66);
1109 movups(xword[CO1 + 0x0], xmm8);
1110 movups(xword[CO1 + 0x10], xmm12);
1111 lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
1112 lea(CO2, ptr[CO2 + LDC * 1]);
1113 align(4);
1114
1115 L(labels[89]);
1116 mov(A, AO);
1117 sub(J, 0x8);
1118 cmp(J, 0x8);
1119 jge(labels[72], T_NEAR);
1120 align(4);
1121
1122 L(labels[90]);
1123 test(J, 0x4);
1124 jle(labels[20], T_NEAR);
1125 mov(AA, K);
1126 imul(AA, AA, 0x10);
1127 add(AA, A);
1128 mov(CO1, C);
1129 add(C, 0x10);
1130 mov(BO, B);
1131 mov(I, N);
1132 cmp(I, 0x4);
1133 jl(labels[5], T_NEAR);
1134 align(4);
1135
1136 L(labels[91]);
1137 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
1138 movups(xmm0, xword[A - 0x80]);
1139 xorps(xmm8, xmm8);
1140 xorps(xmm9, xmm9);
1141 movups(xmm2, xword[A - 0x70]);
1142 xorps(xmm10, xmm10);
1143 xorps(xmm11, xmm11);
1144 movaps(xmm4, xword[BO - 0x80]);
1145 xorps(xmm12, xmm12);
1146 movaps(xmm5, xword[BO - 0x70]);
1147 xorps(xmm13, xmm13);
1148 xorps(xmm14, xmm14);
1149 xorps(xmm15, xmm15);
1150 mov(AO, A);
1151 mov(H, K);
1152 sar(H, 0x2);
1153 jle(labels[2], T_NEAR);
1154 sub(H, 0x1e);
1155 jle(labels[0], T_NEAR);
1156 align(4);
1157
1158 L(labels[92]);
1159 prefetcht0(byte[AO + 0x180]);
1160 prefetcht0(byte[BO + 0x100]);
1161 pshufd(xmm6, xmm4, 0xb1);
1162 mulps(xmm4, xmm0);
1163 addps(xmm8, xmm4);
1164 pshufd(xmm4, xmm6, 0x1b);
1165 mulps(xmm6, xmm0);
1166 addps(xmm9, xmm6);
1167 pshufd(xmm6, xmm4, 0xb1);
1168 mulps(xmm4, xmm0);
1169 addps(xmm10, xmm4);
1170 movaps(xmm4, xword[BO - 0x60]);
1171 mulps(xmm6, xmm0);
1172 movups(xmm0, xword[AO - 0x60]);
1173 addps(xmm11, xmm6);
1174 pshufd(xmm6, xmm5, 0xb1);
1175 mulps(xmm5, xmm2);
1176 addps(xmm8, xmm5);
1177 pshufd(xmm5, xmm6, 0x1b);
1178 mulps(xmm6, xmm2);
1179 addps(xmm9, xmm6);
1180 pshufd(xmm6, xmm5, 0xb1);
1181 mulps(xmm5, xmm2);
1182 addps(xmm10, xmm5);
1183 movaps(xmm5, xword[BO - 0x50]);
1184 mulps(xmm6, xmm2);
1185 movups(xmm2, xword[AO - 0x50]);
1186 addps(xmm11, xmm6);
1187 pshufd(xmm6, xmm4, 0xb1);
1188 mulps(xmm4, xmm0);
1189 addps(xmm8, xmm4);
1190 pshufd(xmm4, xmm6, 0x1b);
1191 mulps(xmm6, xmm0);
1192 addps(xmm9, xmm6);
1193 pshufd(xmm6, xmm4, 0xb1);
1194 mulps(xmm4, xmm0);
1195 addps(xmm10, xmm4);
1196 movaps(xmm4, xword[BO - 0x40]);
1197 mulps(xmm6, xmm0);
1198 movups(xmm0, xword[AO - 0x40]);
1199 addps(xmm11, xmm6);
1200 pshufd(xmm6, xmm5, 0xb1);
1201 mulps(xmm5, xmm2);
1202 addps(xmm8, xmm5);
1203 pshufd(xmm5, xmm6, 0x1b);
1204 mulps(xmm6, xmm2);
1205 addps(xmm9, xmm6);
1206 pshufd(xmm6, xmm5, 0xb1);
1207 mulps(xmm5, xmm2);
1208 addps(xmm10, xmm5);
1209 movaps(xmm5, xword[BO - 0x30]);
1210 add(AA, 0x8);
1211 sub(BO, -64);
1212 mulps(xmm6, xmm2);
1213 movups(xmm2, xword[AO - 0x30]);
1214 sub(AO, -64);
1215 addps(xmm11, xmm6);
1216 prefetcht0(byte[AA - 0x78]);
1217 sub(H, 0x1);
1218 jg(labels[92], T_NEAR);
1219 align(4);
1220
// labels[0]: fall-through target once the preceding loop's counter H is <= 0.
// Prefetches the four C locations addressed through CO1 and CO2 (each with
// and without LDC, at +0xc) and adds 0x1e back to H so that labels[1] runs
// the remaining passes.  NOTE(review): the matching `sub(H, 0x1e)` is outside
// this chunk - confirm.
1221 L(labels[0]);
1222 prefetcht0(byte[CO1 + 0xc]);
1223 prefetcht0(byte[CO1 + LDC * 1 + 0xc]);
1224 prefetcht0(byte[CO2 + 0xc]);
1225 prefetcht0(byte[CO2 + LDC * 1 + 0xc]);
1226 add(H, 0x1e);
1227 align(4);
1228
// labels[1]: unrolled loop, four steps per pass, H passes in total.
// Each step multiplies one 4-float A vector (xmm0 or xmm2) by one 4-float B
// vector (xmm4 or xmm5) and by three lane permutations of it, accumulating
// into xmm8..xmm11.  pshufd 0xb1 swaps the two floats inside each 64-bit half
// and 0x1b reverses all four lanes, so the B lane orders used are
// [0,1,2,3], [1,0,3,2], [2,3,0,1] and [3,2,1,0].
1229 L(labels[1]);
1230 prefetcht0(byte[AO + 0x180]);
1231 prefetcht0(byte[BO + 0x100]);
// step 1: A in xmm0, B in xmm4; both are reloaded for step 3 on the way.
1232 pshufd(xmm6, xmm4, 0xb1);
1233 mulps(xmm4, xmm0);
1234 addps(xmm8, xmm4);
1235 pshufd(xmm4, xmm6, 0x1b);
1236 mulps(xmm6, xmm0);
1237 addps(xmm9, xmm6);
1238 pshufd(xmm6, xmm4, 0xb1);
1239 mulps(xmm4, xmm0);
1240 addps(xmm10, xmm4);
1241 movaps(xmm4, xword[BO - 0x60]);
1242 mulps(xmm6, xmm0);
1243 movups(xmm0, xword[AO - 0x60]);
1244 addps(xmm11, xmm6);
// step 2: A in xmm2, B in xmm5; both are reloaded for step 4 on the way.
1245 pshufd(xmm6, xmm5, 0xb1);
1246 mulps(xmm5, xmm2);
1247 addps(xmm8, xmm5);
1248 pshufd(xmm5, xmm6, 0x1b);
1249 mulps(xmm6, xmm2);
1250 addps(xmm9, xmm6);
1251 pshufd(xmm6, xmm5, 0xb1);
1252 mulps(xmm5, xmm2);
1253 addps(xmm10, xmm5);
1254 movaps(xmm5, xword[BO - 0x50]);
1255 mulps(xmm6, xmm2);
1256 movups(xmm2, xword[AO - 0x50]);
1257 addps(xmm11, xmm6);
// step 3: xmm0/xmm4 again; the reloads fetch the operands of the next pass.
1258 pshufd(xmm6, xmm4, 0xb1);
1259 mulps(xmm4, xmm0);
1260 addps(xmm8, xmm4);
1261 pshufd(xmm4, xmm6, 0x1b);
1262 mulps(xmm6, xmm0);
1263 addps(xmm9, xmm6);
1264 pshufd(xmm6, xmm4, 0xb1);
1265 mulps(xmm4, xmm0);
1266 addps(xmm10, xmm4);
1267 movaps(xmm4, xword[BO - 0x40]);
1268 mulps(xmm6, xmm0);
1269 movups(xmm0, xword[AO - 0x40]);
1270 addps(xmm11, xmm6);
// step 4: xmm2/xmm5 again, interleaved with the pointer updates
// (AA += 8, BO += 64, AO += 64) and a prefetch through AA.
1271 pshufd(xmm6, xmm5, 0xb1);
1272 mulps(xmm5, xmm2);
1273 addps(xmm8, xmm5);
1274 pshufd(xmm5, xmm6, 0x1b);
1275 mulps(xmm6, xmm2);
1276 addps(xmm9, xmm6);
1277 pshufd(xmm6, xmm5, 0xb1);
1278 mulps(xmm5, xmm2);
1279 addps(xmm10, xmm5);
1280 movaps(xmm5, xword[BO - 0x30]);
1281 add(AA, 0x8);
1282 sub(BO, -64);
1283 mulps(xmm6, xmm2);
1284 movups(xmm2, xword[AO - 0x30]);
1285 sub(AO, -64);
1286 addps(xmm11, xmm6);
1287 prefetcht0(byte[AA - 0x78]);
1288 sub(H, 0x1);
1289 jg(labels[1], T_NEAR);
1290 align(4);
1291
// labels[2]: K remainder.  H = K & 3 single steps are left after the
// four-step passes; skip straight to labels[4] when that is zero.
1292 L(labels[2]);
1293 mov(H, K);
1294 and_(H, 0x3);
1295 je(labels[4], T_NEAR);
1296 align(4);
1297
// labels[3]: one step per iteration.  xmm0 (A) is multiplied by xmm4 (B) and
// by its three lane permutations, accumulating into xmm8..xmm11; the next A
// and B vectors are loaded and AO/BO advance by 16 bytes each.
// The movaps copies into xmm7 are not read anywhere in this block.
1298 L(labels[3]);
1299 pshufd(xmm6, xmm4, 0xb1);
1300 movaps(xmm7, xmm4);
1301 mulps(xmm4, xmm0);
1302 addps(xmm8, xmm4);
1303 pshufd(xmm4, xmm6, 0x1b);
1304 movaps(xmm7, xmm6);
1305 mulps(xmm6, xmm0);
1306 addps(xmm9, xmm6);
1307 pshufd(xmm6, xmm4, 0xb1);
1308 movaps(xmm7, xmm4);
1309 mulps(xmm4, xmm0);
1310 addps(xmm10, xmm4);
1311 movaps(xmm4, xword[BO - 0x70]);
1312 movaps(xmm7, xmm6);
1313 mulps(xmm6, xmm0);
1314 movups(xmm0, xword[AO - 0x70]);
1315 addps(xmm11, xmm6);
1316 sub(AO, -16);
1317 sub(BO, -16);
1318 dec(H);
1319 jg(labels[3], T_NEAR);
1320 align(4);
1321
// labels[4]: regroup the accumulators and store the tile.
// xmm8..xmm11 hold products in permuted lane order (one permutation of the B
// lanes per register).  The unpcklpd/unpckhpd/shufps sequence rearranges them
// so that xmm8, xmm9, xmm10 and xmm11 each hold the four products sharing B
// lane 0, 1, 2 and 3 respectively, ordered by A lane.
1322 L(labels[4]);
1323 movaps(xmm0, xmm8);
1324 unpcklpd(xmm8, xmm9);
1325 unpckhpd(xmm0, xmm9);
1326 movaps(xmm1, xmm10);
1327 unpckhpd(xmm10, xmm11);
1328 unpcklpd(xmm1, xmm11);
1329 movaps(xmm9, xmm8);
1330 shufps(xmm8, xmm10, 0xcc);
1331 shufps(xmm9, xmm10, 0x66);
1332 movaps(xmm10, xmm1);
1333 movaps(xmm11, xmm1);
1334 shufps(xmm10, xmm0, 0xcc);
1335 shufps(xmm11, xmm0, 0x66);
// Four unaligned 16-byte stores; C is written without being read first.
1336 movups(xword[CO1 + 0x0], xmm8);
1337 movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
1338 movups(xword[CO2], xmm10);
1339 movups(xword[CO2 + LDC * 1], xmm11);
// Advance both C pointers by 4*LDC and loop back to labels[91] (defined
// outside this chunk) while at least 4 remain in I.
1340 lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
1341 lea(CO2, ptr[CO2 + LDC * 4]);
1342 sub(I, 0x4);
1343 cmp(I, 0x4);
1344 jge(labels[91], T_NEAR);
1345 align(4);
1346
// labels[5]: tile for a remainder of 2 in I, entered only when bit 1 of I is
// set.  TEST with mask 0x2 leaves SF = OF = 0, so jle is taken exactly when
// the result is zero (bit clear).
1347 L(labels[5]);
1348 test(I, 0x2);
1349 jle(labels[12], T_NEAR);
1350 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
// Preload the first two 4-float A vectors and the first two B pairs (each
// 8-byte pair duplicated into both halves by movddup), and zero the
// accumulators.  xmm12..xmm15 are cleared too but are not referenced again in
// the visible part of this tile.
1351 movups(xmm0, xword[A - 0x80]);
1352 xorps(xmm8, xmm8);
1353 xorps(xmm9, xmm9);
1354 movups(xmm2, xword[A - 0x70]);
1355 xorps(xmm10, xmm10);
1356 xorps(xmm11, xmm11);
1357 movddup(xmm4, qword[BO - 0x80]);
1358 xorps(xmm12, xmm12);
1359 movddup(xmm5, qword[BO - 0x78]);
1360 xorps(xmm13, xmm13);
1361 xorps(xmm14, xmm14);
1362 xorps(xmm15, xmm15);
// H = K >> 2 four-step passes; none -> go to the K remainder (labels[9]).
// Otherwise bias H by -0x1e: labels[6] runs all but the last 0x1e passes,
// and labels[7] restores the bias (directly, if there are at most 0x1e).
1363 mov(AO, A);
1364 mov(H, K);
1365 sar(H, 0x2);
1366 jle(labels[9], T_NEAR);
1367 sub(H, 0x1e);
1368 jle(labels[7], T_NEAR);
1369 align(4);
1370
// labels[6]: unrolled loop, four steps per pass, for the tile whose B operand
// is an 8-byte pair duplicated by movddup.  Each step multiplies a 4-float A
// vector (xmm0 or xmm2) by the B vector (xmm4 or xmm5) and by three lane
// permutations of it (pshufd 0xb1 swaps the floats inside each 64-bit half,
// 0x1b reverses all four lanes), accumulating into xmm8..xmm11.
// Per pass: AO += 64, BO += 32, AA += 8.
1371 L(labels[6]);
1372 prefetcht0(byte[AO + 0x180]);
1373 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
1374 pshufd(xmm6, xmm4, 0xb1);
1375 mulps(xmm4, xmm0);
1376 addps(xmm8, xmm4);
1377 pshufd(xmm4, xmm6, 0x1b);
1378 mulps(xmm6, xmm0);
1379 addps(xmm9, xmm6);
1380 pshufd(xmm6, xmm4, 0xb1);
1381 mulps(xmm4, xmm0);
1382 addps(xmm10, xmm4);
1383 movddup(xmm4, qword[BO - 0x70]);
1384 mulps(xmm6, xmm0);
1385 movups(xmm0, xword[AO - 0x60]);
1386 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
1387 pshufd(xmm6, xmm5, 0xb1);
1388 mulps(xmm5, xmm2);
1389 addps(xmm8, xmm5);
1390 pshufd(xmm5, xmm6, 0x1b);
1391 mulps(xmm6, xmm2);
1392 addps(xmm9, xmm6);
1393 pshufd(xmm6, xmm5, 0xb1);
1394 mulps(xmm5, xmm2);
1395 addps(xmm10, xmm5);
1396 movddup(xmm5, qword[BO - 0x68]);
1397 mulps(xmm6, xmm2);
1398 movups(xmm2, xword[AO - 0x50]);
1399 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
1400 pshufd(xmm6, xmm4, 0xb1);
1401 mulps(xmm4, xmm0);
1402 addps(xmm8, xmm4);
1403 pshufd(xmm4, xmm6, 0x1b);
1404 mulps(xmm6, xmm0);
1405 addps(xmm9, xmm6);
1406 pshufd(xmm6, xmm4, 0xb1);
1407 mulps(xmm4, xmm0);
1408 addps(xmm10, xmm4);
1409 movddup(xmm4, qword[BO - 0x60]);
1410 mulps(xmm6, xmm0);
1411 movups(xmm0, xword[AO - 0x40]);
1412 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
1413 pshufd(xmm6, xmm5, 0xb1);
1414 mulps(xmm5, xmm2);
1415 addps(xmm8, xmm5);
1416 pshufd(xmm5, xmm6, 0x1b);
1417 mulps(xmm6, xmm2);
1418 addps(xmm9, xmm6);
1419 pshufd(xmm6, xmm5, 0xb1);
1420 mulps(xmm5, xmm2);
1421 addps(xmm10, xmm5);
1422 movddup(xmm5, qword[BO - 0x58]);
1423 add(AA, 0x8);
1424 sub(BO, -32);
1425 mulps(xmm6, xmm2);
1426 movups(xmm2, xword[AO - 0x30]);
1427 sub(AO, -64);
1428 addps(xmm11, xmm6);
1429 prefetcht0(byte[AA - 0x78]);
1430 sub(H, 0x1);
1431 jg(labels[6], T_NEAR);
1432 align(4);
1433
// labels[7]: prefetch the two C locations this tile will store to
// (CO1 and CO1 + LDC, at +0xc) and undo the -0x1e bias on H so that labels[8]
// executes the remaining passes.
1434 L(labels[7]);
1435 prefetcht0(byte[CO1 + 0xc]);
1436 prefetcht0(byte[CO1 + LDC * 1 + 0xc]);
1437 add(H, 0x1e);
1438 align(4);
1439
// labels[8]: same four-step pass as labels[6] (4-float A vectors in
// xmm0/xmm2, movddup-duplicated B pairs in xmm4/xmm5, accumulators
// xmm8..xmm11), run for the remaining H passes.
// Per pass: AO += 64, BO += 32, AA += 8.
1440 L(labels[8]);
1441 prefetcht0(byte[AO + 0x180]);
1442 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
1443 pshufd(xmm6, xmm4, 0xb1);
1444 mulps(xmm4, xmm0);
1445 addps(xmm8, xmm4);
1446 pshufd(xmm4, xmm6, 0x1b);
1447 mulps(xmm6, xmm0);
1448 addps(xmm9, xmm6);
1449 pshufd(xmm6, xmm4, 0xb1);
1450 mulps(xmm4, xmm0);
1451 addps(xmm10, xmm4);
1452 movddup(xmm4, qword[BO - 0x70]);
1453 mulps(xmm6, xmm0);
1454 movups(xmm0, xword[AO - 0x60]);
1455 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
1456 pshufd(xmm6, xmm5, 0xb1);
1457 mulps(xmm5, xmm2);
1458 addps(xmm8, xmm5);
1459 pshufd(xmm5, xmm6, 0x1b);
1460 mulps(xmm6, xmm2);
1461 addps(xmm9, xmm6);
1462 pshufd(xmm6, xmm5, 0xb1);
1463 mulps(xmm5, xmm2);
1464 addps(xmm10, xmm5);
1465 movddup(xmm5, qword[BO - 0x68]);
1466 mulps(xmm6, xmm2);
1467 movups(xmm2, xword[AO - 0x50]);
1468 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
1469 pshufd(xmm6, xmm4, 0xb1);
1470 mulps(xmm4, xmm0);
1471 addps(xmm8, xmm4);
1472 pshufd(xmm4, xmm6, 0x1b);
1473 mulps(xmm6, xmm0);
1474 addps(xmm9, xmm6);
1475 pshufd(xmm6, xmm4, 0xb1);
1476 mulps(xmm4, xmm0);
1477 addps(xmm10, xmm4);
1478 movddup(xmm4, qword[BO - 0x60]);
1479 mulps(xmm6, xmm0);
1480 movups(xmm0, xword[AO - 0x40]);
1481 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
1482 pshufd(xmm6, xmm5, 0xb1);
1483 mulps(xmm5, xmm2);
1484 addps(xmm8, xmm5);
1485 pshufd(xmm5, xmm6, 0x1b);
1486 mulps(xmm6, xmm2);
1487 addps(xmm9, xmm6);
1488 pshufd(xmm6, xmm5, 0xb1);
1489 mulps(xmm5, xmm2);
1490 addps(xmm10, xmm5);
1491 movddup(xmm5, qword[BO - 0x58]);
1492 add(AA, 0x8);
1493 sub(BO, -32);
1494 mulps(xmm6, xmm2);
1495 movups(xmm2, xword[AO - 0x30]);
1496 sub(AO, -64);
1497 addps(xmm11, xmm6);
1498 prefetcht0(byte[AA - 0x78]);
1499 sub(H, 0x1);
1500 jg(labels[8], T_NEAR);
1501 align(4);
1502
// labels[9]: K remainder.  H = K & 3 single steps remain; skip to labels[11]
// when that is zero.
1503 L(labels[9]);
1504 mov(H, K);
1505 and_(H, 0x3);
1506 je(labels[11], T_NEAR);
1507 align(4);
1508
// labels[10]: one step per iteration: xmm0 (4-float A vector) times xmm4
// (duplicated B pair) and its lane permutations, accumulated into
// xmm8..xmm11.  AO advances by 16 bytes and BO by 8 bytes per step.
// The movaps copies into xmm7 are not read anywhere in this block.
1509 L(labels[10]);
1510 pshufd(xmm6, xmm4, 0xb1);
1511 movaps(xmm7, xmm4);
1512 mulps(xmm4, xmm0);
1513 addps(xmm8, xmm4);
1514 pshufd(xmm4, xmm6, 0x1b);
1515 movaps(xmm7, xmm6);
1516 mulps(xmm6, xmm0);
1517 addps(xmm9, xmm6);
1518 pshufd(xmm6, xmm4, 0xb1);
1519 movaps(xmm7, xmm4);
1520 mulps(xmm4, xmm0);
1521 addps(xmm10, xmm4);
1522 movddup(xmm4, qword[BO - 0x78]);
1523 movaps(xmm7, xmm6);
1524 mulps(xmm6, xmm0);
1525 movups(xmm0, xword[AO - 0x70]);
1526 addps(xmm11, xmm6);
1527 sub(AO, -16);
1528 sub(BO, -8);
1529 dec(H);
1530 jg(labels[10], T_NEAR);
1531 align(4);
1532
// labels[11]: regroup the permuted accumulators and store two 4-float
// vectors.  After the unpck/shufps sequence xmm8 holds the four products
// formed with the first float of the B pair and xmm9 those formed with the
// second, each ordered by A lane.  xmm0 and xmm1 are computed but not stored
// in this block.
1533 L(labels[11]);
1534 movaps(xmm0, xmm8);
1535 unpcklpd(xmm8, xmm9);
1536 unpckhpd(xmm0, xmm9);
1537 movaps(xmm1, xmm10);
1538 unpckhpd(xmm10, xmm11);
1539 unpcklpd(xmm1, xmm11);
1540 movaps(xmm9, xmm8);
1541 shufps(xmm8, xmm10, 0xcc);
1542 shufps(xmm9, xmm10, 0x66);
// C is written without being read first; both C pointers then move by 2*LDC.
1543 movups(xword[CO1 + 0x0], xmm8);
1544 movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
1545 lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
1546 lea(CO2, ptr[CO2 + LDC * 2]);
1547 align(4);
1548
// labels[12]: tile for a remainder of 1 in I, entered only when bit 0 of I is
// set.  TEST with mask 0x1 leaves SF = OF = 0, so jle is taken exactly when
// the bit is clear.
1549 L(labels[12]);
1550 test(I, 0x1);
1551 jle(labels[19], T_NEAR);
1552 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
// Preload the first two 4-float A vectors and the first two B scalars (movss
// from memory puts the float in lane 0 and zeroes the other lanes), and zero
// the accumulators.  xmm12..xmm15 are cleared too but are not referenced
// again in the visible part of this tile.
1553 movups(xmm0, xword[A - 0x80]);
1554 xorps(xmm8, xmm8);
1555 xorps(xmm9, xmm9);
1556 movups(xmm2, xword[A - 0x70]);
1557 xorps(xmm10, xmm10);
1558 xorps(xmm11, xmm11);
1559 movss(xmm4, dword[BO - 0x80]);
1560 xorps(xmm12, xmm12);
1561 movss(xmm5, dword[BO - 0x7c]);
1562 xorps(xmm13, xmm13);
1563 xorps(xmm14, xmm14);
1564 xorps(xmm15, xmm15);
// H = K >> 2 four-step passes; none -> go to the K remainder (labels[16]).
// Otherwise bias H by -0x1e: labels[13] runs all but the last 0x1e passes,
// and labels[14] restores the bias (directly, if there are at most 0x1e).
1565 mov(AO, A);
1566 mov(H, K);
1567 sar(H, 0x2);
1568 jle(labels[16], T_NEAR);
1569 sub(H, 0x1e);
1570 jle(labels[14], T_NEAR);
1571 align(4);
1572
// labels[13]: unrolled loop, four steps per pass, for the tile whose B
// operand is a single float in lane 0 (other lanes zero).  Each step
// multiplies a 4-float A vector (xmm0 or xmm2) by the B register (xmm4 or
// xmm5) and by three lane permutations of it (pshufd 0xb1 / 0x1b), so each of
// xmm8..xmm11 accumulates the product in a different lane.
// Per pass: AO += 64, BO += 16, AA += 8.
1573 L(labels[13]);
1574 prefetcht0(byte[AO + 0x180]);
1575 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
1576 pshufd(xmm6, xmm4, 0xb1);
1577 mulps(xmm4, xmm0);
1578 addps(xmm8, xmm4);
1579 pshufd(xmm4, xmm6, 0x1b);
1580 mulps(xmm6, xmm0);
1581 addps(xmm9, xmm6);
1582 pshufd(xmm6, xmm4, 0xb1);
1583 mulps(xmm4, xmm0);
1584 addps(xmm10, xmm4);
1585 movss(xmm4, dword[BO - 0x78]);
1586 mulps(xmm6, xmm0);
1587 movups(xmm0, xword[AO - 0x60]);
1588 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
1589 pshufd(xmm6, xmm5, 0xb1);
1590 mulps(xmm5, xmm2);
1591 addps(xmm8, xmm5);
1592 pshufd(xmm5, xmm6, 0x1b);
1593 mulps(xmm6, xmm2);
1594 addps(xmm9, xmm6);
1595 pshufd(xmm6, xmm5, 0xb1);
1596 mulps(xmm5, xmm2);
1597 addps(xmm10, xmm5);
1598 movss(xmm5, dword[BO - 0x74]);
1599 mulps(xmm6, xmm2);
1600 movups(xmm2, xword[AO - 0x50]);
1601 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
1602 pshufd(xmm6, xmm4, 0xb1);
1603 mulps(xmm4, xmm0);
1604 addps(xmm8, xmm4);
1605 pshufd(xmm4, xmm6, 0x1b);
1606 mulps(xmm6, xmm0);
1607 addps(xmm9, xmm6);
1608 pshufd(xmm6, xmm4, 0xb1);
1609 mulps(xmm4, xmm0);
1610 addps(xmm10, xmm4);
1611 movss(xmm4, dword[BO - 0x70]);
1612 mulps(xmm6, xmm0);
1613 movups(xmm0, xword[AO - 0x40]);
1614 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
1615 pshufd(xmm6, xmm5, 0xb1);
1616 mulps(xmm5, xmm2);
1617 addps(xmm8, xmm5);
1618 pshufd(xmm5, xmm6, 0x1b);
1619 mulps(xmm6, xmm2);
1620 addps(xmm9, xmm6);
1621 pshufd(xmm6, xmm5, 0xb1);
1622 mulps(xmm5, xmm2);
1623 addps(xmm10, xmm5);
1624 movss(xmm5, dword[BO - 0x6c]);
1625 add(AA, 0x8);
1626 sub(BO, -16);
1627 mulps(xmm6, xmm2);
1628 movups(xmm2, xword[AO - 0x30]);
1629 sub(AO, -64);
1630 addps(xmm11, xmm6);
1631 prefetcht0(byte[AA - 0x78]);
1632 sub(H, 0x1);
1633 jg(labels[13], T_NEAR);
1634 align(4);
1635
// labels[14]: prefetch the C location this tile will store to (CO1 + 0xc)
// and undo the -0x1e bias on H so that labels[15] executes the remaining
// passes.
1636 L(labels[14]);
1637 prefetcht0(byte[CO1 + 0xc]);
1638 add(H, 0x1e);
1639 align(4);
1640
// labels[15]: same four-step pass as labels[13] (4-float A vectors in
// xmm0/xmm2, single-float B operands in xmm4/xmm5, accumulators xmm8..xmm11),
// run for the remaining H passes.
// Per pass: AO += 64, BO += 16, AA += 8.
1641 L(labels[15]);
1642 prefetcht0(byte[AO + 0x180]);
1643 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
1644 pshufd(xmm6, xmm4, 0xb1);
1645 mulps(xmm4, xmm0);
1646 addps(xmm8, xmm4);
1647 pshufd(xmm4, xmm6, 0x1b);
1648 mulps(xmm6, xmm0);
1649 addps(xmm9, xmm6);
1650 pshufd(xmm6, xmm4, 0xb1);
1651 mulps(xmm4, xmm0);
1652 addps(xmm10, xmm4);
1653 movss(xmm4, dword[BO - 0x78]);
1654 mulps(xmm6, xmm0);
1655 movups(xmm0, xword[AO - 0x60]);
1656 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
1657 pshufd(xmm6, xmm5, 0xb1);
1658 mulps(xmm5, xmm2);
1659 addps(xmm8, xmm5);
1660 pshufd(xmm5, xmm6, 0x1b);
1661 mulps(xmm6, xmm2);
1662 addps(xmm9, xmm6);
1663 pshufd(xmm6, xmm5, 0xb1);
1664 mulps(xmm5, xmm2);
1665 addps(xmm10, xmm5);
1666 movss(xmm5, dword[BO - 0x74]);
1667 mulps(xmm6, xmm2);
1668 movups(xmm2, xword[AO - 0x50]);
1669 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
1670 pshufd(xmm6, xmm4, 0xb1);
1671 mulps(xmm4, xmm0);
1672 addps(xmm8, xmm4);
1673 pshufd(xmm4, xmm6, 0x1b);
1674 mulps(xmm6, xmm0);
1675 addps(xmm9, xmm6);
1676 pshufd(xmm6, xmm4, 0xb1);
1677 mulps(xmm4, xmm0);
1678 addps(xmm10, xmm4);
1679 movss(xmm4, dword[BO - 0x70]);
1680 mulps(xmm6, xmm0);
1681 movups(xmm0, xword[AO - 0x40]);
1682 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
1683 pshufd(xmm6, xmm5, 0xb1);
1684 mulps(xmm5, xmm2);
1685 addps(xmm8, xmm5);
1686 pshufd(xmm5, xmm6, 0x1b);
1687 mulps(xmm6, xmm2);
1688 addps(xmm9, xmm6);
1689 pshufd(xmm6, xmm5, 0xb1);
1690 mulps(xmm5, xmm2);
1691 addps(xmm10, xmm5);
1692 movss(xmm5, dword[BO - 0x6c]);
1693 add(AA, 0x8);
1694 sub(BO, -16);
1695 mulps(xmm6, xmm2);
1696 movups(xmm2, xword[AO - 0x30]);
1697 sub(AO, -64);
1698 addps(xmm11, xmm6);
1699 prefetcht0(byte[AA - 0x78]);
1700 sub(H, 0x1);
1701 jg(labels[15], T_NEAR);
1702 align(4);
1703
// labels[16]: K remainder.  H = K & 3 single steps remain; skip to labels[18]
// when that is zero.
1704 L(labels[16]);
1705 mov(H, K);
1706 and_(H, 0x3);
1707 je(labels[18], T_NEAR);
1708 align(4);
1709
// labels[17]: one step per iteration: xmm0 (4-float A vector) times xmm4
// (single B float in lane 0) and its lane permutations, accumulated into
// xmm8..xmm11.  AO advances by 16 bytes and BO by 4 bytes per step.
// The movaps copies into xmm7 are not read anywhere in this block.
1710 L(labels[17]);
1711 pshufd(xmm6, xmm4, 0xb1);
1712 movaps(xmm7, xmm4);
1713 mulps(xmm4, xmm0);
1714 addps(xmm8, xmm4);
1715 pshufd(xmm4, xmm6, 0x1b);
1716 movaps(xmm7, xmm6);
1717 mulps(xmm6, xmm0);
1718 addps(xmm9, xmm6);
1719 pshufd(xmm6, xmm4, 0xb1);
1720 movaps(xmm7, xmm4);
1721 mulps(xmm4, xmm0);
1722 addps(xmm10, xmm4);
1723 movss(xmm4, dword[BO - 0x7c]);
1724 movaps(xmm7, xmm6);
1725 mulps(xmm6, xmm0);
1726 movups(xmm0, xword[AO - 0x70]);
1727 addps(xmm11, xmm6);
1728 sub(AO, -16);
1729 sub(BO, -4);
1730 dec(H);
1731 jg(labels[17], T_NEAR);
1732 align(4);
1733
// labels[18]: gather the four products (one per accumulator, each in a
// different lane) into xmm8, ordered by A lane, and store that single 4-float
// vector.  xmm0, xmm1 and xmm9 are computed but not stored in this block.
1734 L(labels[18]);
1735 movaps(xmm0, xmm8);
1736 unpcklpd(xmm8, xmm9);
1737 unpckhpd(xmm0, xmm9);
1738 movaps(xmm1, xmm10);
1739 unpckhpd(xmm10, xmm11);
1740 unpcklpd(xmm1, xmm11);
1741 movaps(xmm9, xmm8);
1742 shufps(xmm8, xmm10, 0xcc);
1743 shufps(xmm9, xmm10, 0x66);
// C is written without being read first; both C pointers then move by LDC.
1744 movups(xword[CO1 + 0x0], xmm8);
1745 lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
1746 lea(CO2, ptr[CO2 + LDC * 1]);
1747 align(4);
1748
// labels[19]: the tiles above are done; A takes over the position AO has
// reached.
1749 L(labels[19]);
1750 mov(A, AO);
1751 align(4);
1752
// labels[20]: section for a remainder of 2 in J, entered only when bit 1 of J
// is set (TEST with mask 0x2 leaves SF = OF = 0, so jle is taken exactly when
// the bit is clear).
1753 L(labels[20]);
1754 test(J, 0x2);
1755 jle(labels[43], T_NEAR);
// AA = A + K * 8 (address used only for prefetching inside the loops below).
1756 mov(AA, K);
1757 imul(AA, AA, 0x8);
1758 add(AA, A);
// CO1 starts at the current C position; C itself moves on by 8 bytes.
1759 mov(CO1, C);
1760 add(C, 0x8);
// Restart BO at B and the I counter at N; fewer than 4 -> remainder tiles.
1761 mov(BO, B);
1762 mov(I, N);
1763 cmp(I, 0x4);
1764 jl(labels[28], T_NEAR);
1765 align(4);
1766
// labels[21]: set-up of the tile that consumes 4 from I with 2-float A
// operands.  Preloads the first two A pairs (movsd from memory fills the low
// 64 bits and zeroes the upper half) and the first two 4-float B vectors, and
// zeroes the accumulators.  xmm12..xmm15 are cleared too but are not
// referenced again in the visible part of this tile.
1767 L(labels[21]);
1768 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
1769 movsd(xmm0, qword[A - 0x80]);
1770 xorps(xmm8, xmm8);
1771 xorps(xmm9, xmm9);
1772 movsd(xmm2, qword[A - 0x78]);
1773 xorps(xmm10, xmm10);
1774 xorps(xmm11, xmm11);
1775 movaps(xmm4, xword[BO - 0x80]);
1776 xorps(xmm12, xmm12);
1777 movaps(xmm5, xword[BO - 0x70]);
1778 xorps(xmm13, xmm13);
1779 xorps(xmm14, xmm14);
1780 xorps(xmm15, xmm15);
// H = K >> 2 four-step passes; none -> go to the K remainder (labels[25]).
// Otherwise bias H by -0x1e: labels[22] runs all but the last 0x1e passes,
// and labels[23] restores the bias (directly, if there are at most 0x1e).
1781 mov(AO, A);
1782 mov(H, K);
1783 sar(H, 0x2);
1784 jle(labels[25], T_NEAR);
1785 sub(H, 0x1e);
1786 jle(labels[23], T_NEAR);
1787 align(4);
1788
// labels[22]: unrolled loop, four steps per pass, with 2-float A operands
// (movsd, upper half zero) and 4-float B vectors.  Each step multiplies the A
// register (xmm0 or xmm2) by the B vector (xmm4 or xmm5) and by three lane
// permutations of it (pshufd 0xb1 swaps the floats inside each 64-bit half,
// 0x1b reverses all four lanes), accumulating into xmm8..xmm11.
// Per pass: AO += 32, BO += 64, AA += 8.
1789 L(labels[22]);
1790 prefetcht0(byte[AO + 0x180]);
1791 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
1792 pshufd(xmm6, xmm4, 0xb1);
1793 mulps(xmm4, xmm0);
1794 addps(xmm8, xmm4);
1795 pshufd(xmm4, xmm6, 0x1b);
1796 mulps(xmm6, xmm0);
1797 addps(xmm9, xmm6);
1798 pshufd(xmm6, xmm4, 0xb1);
1799 mulps(xmm4, xmm0);
1800 addps(xmm10, xmm4);
1801 movaps(xmm4, xword[BO - 0x60]);
1802 mulps(xmm6, xmm0);
1803 movsd(xmm0, qword[AO - 0x70]);
1804 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
1805 pshufd(xmm6, xmm5, 0xb1);
1806 mulps(xmm5, xmm2);
1807 addps(xmm8, xmm5);
1808 pshufd(xmm5, xmm6, 0x1b);
1809 mulps(xmm6, xmm2);
1810 addps(xmm9, xmm6);
1811 pshufd(xmm6, xmm5, 0xb1);
1812 mulps(xmm5, xmm2);
1813 addps(xmm10, xmm5);
1814 movaps(xmm5, xword[BO - 0x50]);
1815 mulps(xmm6, xmm2);
1816 movsd(xmm2, qword[AO - 0x68]);
1817 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
1818 pshufd(xmm6, xmm4, 0xb1);
1819 mulps(xmm4, xmm0);
1820 addps(xmm8, xmm4);
1821 pshufd(xmm4, xmm6, 0x1b);
1822 mulps(xmm6, xmm0);
1823 addps(xmm9, xmm6);
1824 pshufd(xmm6, xmm4, 0xb1);
1825 mulps(xmm4, xmm0);
1826 addps(xmm10, xmm4);
1827 movaps(xmm4, xword[BO - 0x40]);
1828 mulps(xmm6, xmm0);
1829 movsd(xmm0, qword[AO - 0x60]);
1830 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
1831 pshufd(xmm6, xmm5, 0xb1);
1832 mulps(xmm5, xmm2);
1833 addps(xmm8, xmm5);
1834 pshufd(xmm5, xmm6, 0x1b);
1835 mulps(xmm6, xmm2);
1836 addps(xmm9, xmm6);
1837 pshufd(xmm6, xmm5, 0xb1);
1838 mulps(xmm5, xmm2);
1839 addps(xmm10, xmm5);
1840 movaps(xmm5, xword[BO - 0x30]);
1841 add(AA, 0x8);
1842 sub(BO, -64);
1843 mulps(xmm6, xmm2);
1844 movsd(xmm2, qword[AO - 0x58]);
1845 sub(AO, -32);
1846 addps(xmm11, xmm6);
1847 prefetcht0(byte[AA - 0x78]);
1848 sub(H, 0x1);
1849 jg(labels[22], T_NEAR);
1850 align(4);
1851
// labels[23]: prefetch the four C locations this tile will store to
// (CO1 and CO2, each with and without LDC, at +0x4) and undo the -0x1e bias
// on H so that labels[24] executes the remaining passes.
1852 L(labels[23]);
1853 prefetcht0(byte[CO1 + 0x4]);
1854 prefetcht0(byte[CO1 + LDC * 1 + 0x4]);
1855 prefetcht0(byte[CO2 + 0x4]);
1856 prefetcht0(byte[CO2 + LDC * 1 + 0x4]);
1857 add(H, 0x1e);
1858 align(4);
1859
// labels[24]: same four-step pass as labels[22] (2-float A operands in
// xmm0/xmm2, 4-float B vectors in xmm4/xmm5, accumulators xmm8..xmm11), run
// for the remaining H passes.
// Per pass: AO += 32, BO += 64, AA += 8.
1860 L(labels[24]);
1861 prefetcht0(byte[AO + 0x180]);
1862 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
1863 pshufd(xmm6, xmm4, 0xb1);
1864 mulps(xmm4, xmm0);
1865 addps(xmm8, xmm4);
1866 pshufd(xmm4, xmm6, 0x1b);
1867 mulps(xmm6, xmm0);
1868 addps(xmm9, xmm6);
1869 pshufd(xmm6, xmm4, 0xb1);
1870 mulps(xmm4, xmm0);
1871 addps(xmm10, xmm4);
1872 movaps(xmm4, xword[BO - 0x60]);
1873 mulps(xmm6, xmm0);
1874 movsd(xmm0, qword[AO - 0x70]);
1875 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
1876 pshufd(xmm6, xmm5, 0xb1);
1877 mulps(xmm5, xmm2);
1878 addps(xmm8, xmm5);
1879 pshufd(xmm5, xmm6, 0x1b);
1880 mulps(xmm6, xmm2);
1881 addps(xmm9, xmm6);
1882 pshufd(xmm6, xmm5, 0xb1);
1883 mulps(xmm5, xmm2);
1884 addps(xmm10, xmm5);
1885 movaps(xmm5, xword[BO - 0x50]);
1886 mulps(xmm6, xmm2);
1887 movsd(xmm2, qword[AO - 0x68]);
1888 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
1889 pshufd(xmm6, xmm4, 0xb1);
1890 mulps(xmm4, xmm0);
1891 addps(xmm8, xmm4);
1892 pshufd(xmm4, xmm6, 0x1b);
1893 mulps(xmm6, xmm0);
1894 addps(xmm9, xmm6);
1895 pshufd(xmm6, xmm4, 0xb1);
1896 mulps(xmm4, xmm0);
1897 addps(xmm10, xmm4);
1898 movaps(xmm4, xword[BO - 0x40]);
1899 mulps(xmm6, xmm0);
1900 movsd(xmm0, qword[AO - 0x60]);
1901 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
1902 pshufd(xmm6, xmm5, 0xb1);
1903 mulps(xmm5, xmm2);
1904 addps(xmm8, xmm5);
1905 pshufd(xmm5, xmm6, 0x1b);
1906 mulps(xmm6, xmm2);
1907 addps(xmm9, xmm6);
1908 pshufd(xmm6, xmm5, 0xb1);
1909 mulps(xmm5, xmm2);
1910 addps(xmm10, xmm5);
1911 movaps(xmm5, xword[BO - 0x30]);
1912 add(AA, 0x8);
1913 sub(BO, -64);
1914 mulps(xmm6, xmm2);
1915 movsd(xmm2, qword[AO - 0x58]);
1916 sub(AO, -32);
1917 addps(xmm11, xmm6);
1918 prefetcht0(byte[AA - 0x78]);
1919 sub(H, 0x1);
1920 jg(labels[24], T_NEAR);
1921 align(4);
1922
// labels[25]: K remainder.  H = K & 3 single steps remain; skip to labels[27]
// when that is zero.
1923 L(labels[25]);
1924 mov(H, K);
1925 and_(H, 0x3);
1926 je(labels[27], T_NEAR);
1927 align(4);
1928
// labels[26]: one step per iteration: xmm0 (2-float A operand) times xmm4
// (4-float B vector) and its lane permutations, accumulated into
// xmm8..xmm11.  AO advances by 8 bytes and BO by 16 bytes per step.
// The movaps copies into xmm7 are not read anywhere in this block.
1929 L(labels[26]);
1930 pshufd(xmm6, xmm4, 0xb1);
1931 movaps(xmm7, xmm4);
1932 mulps(xmm4, xmm0);
1933 addps(xmm8, xmm4);
1934 pshufd(xmm4, xmm6, 0x1b);
1935 movaps(xmm7, xmm6);
1936 mulps(xmm6, xmm0);
1937 addps(xmm9, xmm6);
1938 pshufd(xmm6, xmm4, 0xb1);
1939 movaps(xmm7, xmm4);
1940 mulps(xmm4, xmm0);
1941 addps(xmm10, xmm4);
1942 movaps(xmm4, xword[BO - 0x70]);
1943 movaps(xmm7, xmm6);
1944 mulps(xmm6, xmm0);
1945 movsd(xmm0, qword[AO - 0x78]);
1946 addps(xmm11, xmm6);
1947 sub(AO, -8);
1948 sub(BO, -16);
1949 dec(H);
1950 jg(labels[26], T_NEAR);
1951 align(4);
1952
// labels[27]: regroup the accumulators and store the tile.
// The unpcklpd/unpckhpd/shufps sequence rearranges the permuted products so
// that xmm8, xmm9, xmm10 and xmm11 each hold the products sharing B lane 0,
// 1, 2 and 3 respectively, ordered by A lane.  Only the low two lanes are
// stored, which are the ones fed by the 2-float A operand.
1953 L(labels[27]);
1954 movaps(xmm0, xmm8);
1955 unpcklpd(xmm8, xmm9);
1956 unpckhpd(xmm0, xmm9);
1957 movaps(xmm1, xmm10);
1958 unpckhpd(xmm10, xmm11);
1959 unpcklpd(xmm1, xmm11);
1960 movaps(xmm9, xmm8);
1961 shufps(xmm8, xmm10, 0xcc);
1962 shufps(xmm9, xmm10, 0x66);
1963 movaps(xmm10, xmm1);
1964 movaps(xmm11, xmm1);
1965 shufps(xmm10, xmm0, 0xcc);
1966 shufps(xmm11, xmm0, 0x66);
// Four 8-byte stores; C is written without being read first.
1967 movlps(qword[CO1 + 0x0], xmm8);
1968 movlps(qword[CO1 + LDC * 1 + 0x0], xmm9);
1969 movlps(qword[CO2], xmm10);
1970 movlps(qword[CO2 + LDC * 1], xmm11);
// Advance both C pointers by 4*LDC and repeat from labels[21] while at least
// 4 remain in I.
1971 lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
1972 lea(CO2, ptr[CO2 + LDC * 4]);
1973 sub(I, 0x4);
1974 cmp(I, 0x4);
1975 jge(labels[21], T_NEAR);
1976 align(4);
1977
// labels[28]: tile for a remainder of 2 in I with 2-float A operands, entered
// only when bit 1 of I is set (TEST with mask 0x2 leaves SF = OF = 0, so jle
// is taken exactly when the bit is clear).
1978 L(labels[28]);
1979 test(I, 0x2);
1980 jle(labels[35], T_NEAR);
1981 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
// Preload the first two A pairs (movsd: low 64 bits, upper half zeroed) and
// the first two B pairs (movddup: 8 bytes duplicated into both halves), and
// zero the accumulators.  xmm12..xmm15 are cleared too but are not referenced
// again in the visible part of this tile.
1982 movsd(xmm0, qword[A - 0x80]);
1983 xorps(xmm8, xmm8);
1984 xorps(xmm9, xmm9);
1985 movsd(xmm2, qword[A - 0x78]);
1986 xorps(xmm10, xmm10);
1987 xorps(xmm11, xmm11);
1988 movddup(xmm4, qword[BO - 0x80]);
1989 xorps(xmm12, xmm12);
1990 movddup(xmm5, qword[BO - 0x78]);
1991 xorps(xmm13, xmm13);
1992 xorps(xmm14, xmm14);
1993 xorps(xmm15, xmm15);
// H = K >> 2 four-step passes; none -> go to the K remainder (labels[32]).
// Otherwise bias H by -0x1e: labels[29] runs all but the last 0x1e passes,
// and labels[30] restores the bias (directly, if there are at most 0x1e).
1994 mov(AO, A);
1995 mov(H, K);
1996 sar(H, 0x2);
1997 jle(labels[32], T_NEAR);
1998 sub(H, 0x1e);
1999 jle(labels[30], T_NEAR);
2000 align(4);
2001
// labels[29]: unrolled loop, four steps per pass, with 2-float A operands
// (movsd) and movddup-duplicated B pairs.  Each step multiplies the A
// register (xmm0 or xmm2) by the B register (xmm4 or xmm5) and by three lane
// permutations of it (pshufd 0xb1 / 0x1b), accumulating into xmm8..xmm11.
// Per pass: AO += 32, BO += 32, AA += 8.
2002 L(labels[29]);
2003 prefetcht0(byte[AO + 0x180]);
2004 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
2005 pshufd(xmm6, xmm4, 0xb1);
2006 mulps(xmm4, xmm0);
2007 addps(xmm8, xmm4);
2008 pshufd(xmm4, xmm6, 0x1b);
2009 mulps(xmm6, xmm0);
2010 addps(xmm9, xmm6);
2011 pshufd(xmm6, xmm4, 0xb1);
2012 mulps(xmm4, xmm0);
2013 addps(xmm10, xmm4);
2014 movddup(xmm4, qword[BO - 0x70]);
2015 mulps(xmm6, xmm0);
2016 movsd(xmm0, qword[AO - 0x70]);
2017 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
2018 pshufd(xmm6, xmm5, 0xb1);
2019 mulps(xmm5, xmm2);
2020 addps(xmm8, xmm5);
2021 pshufd(xmm5, xmm6, 0x1b);
2022 mulps(xmm6, xmm2);
2023 addps(xmm9, xmm6);
2024 pshufd(xmm6, xmm5, 0xb1);
2025 mulps(xmm5, xmm2);
2026 addps(xmm10, xmm5);
2027 movddup(xmm5, qword[BO - 0x68]);
2028 mulps(xmm6, xmm2);
2029 movsd(xmm2, qword[AO - 0x68]);
2030 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
2031 pshufd(xmm6, xmm4, 0xb1);
2032 mulps(xmm4, xmm0);
2033 addps(xmm8, xmm4);
2034 pshufd(xmm4, xmm6, 0x1b);
2035 mulps(xmm6, xmm0);
2036 addps(xmm9, xmm6);
2037 pshufd(xmm6, xmm4, 0xb1);
2038 mulps(xmm4, xmm0);
2039 addps(xmm10, xmm4);
2040 movddup(xmm4, qword[BO - 0x60]);
2041 mulps(xmm6, xmm0);
2042 movsd(xmm0, qword[AO - 0x60]);
2043 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
2044 pshufd(xmm6, xmm5, 0xb1);
2045 mulps(xmm5, xmm2);
2046 addps(xmm8, xmm5);
2047 pshufd(xmm5, xmm6, 0x1b);
2048 mulps(xmm6, xmm2);
2049 addps(xmm9, xmm6);
2050 pshufd(xmm6, xmm5, 0xb1);
2051 mulps(xmm5, xmm2);
2052 addps(xmm10, xmm5);
2053 movddup(xmm5, qword[BO - 0x58]);
2054 add(AA, 0x8);
2055 sub(BO, -32);
2056 mulps(xmm6, xmm2);
2057 movsd(xmm2, qword[AO - 0x58]);
2058 sub(AO, -32);
2059 addps(xmm11, xmm6);
2060 prefetcht0(byte[AA - 0x78]);
2061 sub(H, 0x1);
2062 jg(labels[29], T_NEAR);
2063 align(4);
2064
// labels[30]: prefetch the two C locations this tile will store to
// (CO1 and CO1 + LDC, at +0x4) and undo the -0x1e bias on H so that
// labels[31] executes the remaining passes.
2065 L(labels[30]);
2066 prefetcht0(byte[CO1 + 0x4]);
2067 prefetcht0(byte[CO1 + LDC * 1 + 0x4]);
2068 add(H, 0x1e);
2069 align(4);
2070
// labels[31]: same four-step pass as labels[29] (2-float A operands in
// xmm0/xmm2, movddup-duplicated B pairs in xmm4/xmm5, accumulators
// xmm8..xmm11), run for the remaining H passes.
// Per pass: AO += 32, BO += 32, AA += 8.
2071 L(labels[31]);
2072 prefetcht0(byte[AO + 0x180]);
2073 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
2074 pshufd(xmm6, xmm4, 0xb1);
2075 mulps(xmm4, xmm0);
2076 addps(xmm8, xmm4);
2077 pshufd(xmm4, xmm6, 0x1b);
2078 mulps(xmm6, xmm0);
2079 addps(xmm9, xmm6);
2080 pshufd(xmm6, xmm4, 0xb1);
2081 mulps(xmm4, xmm0);
2082 addps(xmm10, xmm4);
2083 movddup(xmm4, qword[BO - 0x70]);
2084 mulps(xmm6, xmm0);
2085 movsd(xmm0, qword[AO - 0x70]);
2086 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
2087 pshufd(xmm6, xmm5, 0xb1);
2088 mulps(xmm5, xmm2);
2089 addps(xmm8, xmm5);
2090 pshufd(xmm5, xmm6, 0x1b);
2091 mulps(xmm6, xmm2);
2092 addps(xmm9, xmm6);
2093 pshufd(xmm6, xmm5, 0xb1);
2094 mulps(xmm5, xmm2);
2095 addps(xmm10, xmm5);
2096 movddup(xmm5, qword[BO - 0x68]);
2097 mulps(xmm6, xmm2);
2098 movsd(xmm2, qword[AO - 0x68]);
2099 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
2100 pshufd(xmm6, xmm4, 0xb1);
2101 mulps(xmm4, xmm0);
2102 addps(xmm8, xmm4);
2103 pshufd(xmm4, xmm6, 0x1b);
2104 mulps(xmm6, xmm0);
2105 addps(xmm9, xmm6);
2106 pshufd(xmm6, xmm4, 0xb1);
2107 mulps(xmm4, xmm0);
2108 addps(xmm10, xmm4);
2109 movddup(xmm4, qword[BO - 0x60]);
2110 mulps(xmm6, xmm0);
2111 movsd(xmm0, qword[AO - 0x60]);
2112 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
2113 pshufd(xmm6, xmm5, 0xb1);
2114 mulps(xmm5, xmm2);
2115 addps(xmm8, xmm5);
2116 pshufd(xmm5, xmm6, 0x1b);
2117 mulps(xmm6, xmm2);
2118 addps(xmm9, xmm6);
2119 pshufd(xmm6, xmm5, 0xb1);
2120 mulps(xmm5, xmm2);
2121 addps(xmm10, xmm5);
2122 movddup(xmm5, qword[BO - 0x58]);
2123 add(AA, 0x8);
2124 sub(BO, -32);
2125 mulps(xmm6, xmm2);
2126 movsd(xmm2, qword[AO - 0x58]);
2127 sub(AO, -32);
2128 addps(xmm11, xmm6);
2129 prefetcht0(byte[AA - 0x78]);
2130 sub(H, 0x1);
2131 jg(labels[31], T_NEAR);
2132 align(4);
2133
// labels[32]: K remainder.  H = K & 3 single steps remain; skip to labels[34]
// when that is zero.
2134 L(labels[32]);
2135 mov(H, K);
2136 and_(H, 0x3);
2137 je(labels[34], T_NEAR);
2138 align(4);
2139
// labels[33]: one step per iteration: xmm0 (2-float A operand) times xmm4
// (duplicated B pair) and its lane permutations, accumulated into
// xmm8..xmm11.  AO and BO advance by 8 bytes each per step.
// The movaps copies into xmm7 are not read anywhere in this block.
2140 L(labels[33]);
2141 pshufd(xmm6, xmm4, 0xb1);
2142 movaps(xmm7, xmm4);
2143 mulps(xmm4, xmm0);
2144 addps(xmm8, xmm4);
2145 pshufd(xmm4, xmm6, 0x1b);
2146 movaps(xmm7, xmm6);
2147 mulps(xmm6, xmm0);
2148 addps(xmm9, xmm6);
2149 pshufd(xmm6, xmm4, 0xb1);
2150 movaps(xmm7, xmm4);
2151 mulps(xmm4, xmm0);
2152 addps(xmm10, xmm4);
2153 movddup(xmm4, qword[BO - 0x78]);
2154 movaps(xmm7, xmm6);
2155 mulps(xmm6, xmm0);
2156 movsd(xmm0, qword[AO - 0x78]);
2157 addps(xmm11, xmm6);
2158 sub(AO, -8);
2159 sub(BO, -8);
2160 dec(H);
2161 jg(labels[33], T_NEAR);
2162 align(4);
2163
// labels[34]: regroup the permuted accumulators so that xmm8 holds the
// products formed with the first float of the B pair and xmm9 those formed
// with the second, ordered by A lane, then store the low two floats of each.
// xmm0 and xmm1 are computed but not stored in this block.
2164 L(labels[34]);
2165 movaps(xmm0, xmm8);
2166 unpcklpd(xmm8, xmm9);
2167 unpckhpd(xmm0, xmm9);
2168 movaps(xmm1, xmm10);
2169 unpckhpd(xmm10, xmm11);
2170 unpcklpd(xmm1, xmm11);
2171 movaps(xmm9, xmm8);
2172 shufps(xmm8, xmm10, 0xcc);
2173 shufps(xmm9, xmm10, 0x66);
// C is written without being read first; both C pointers then move by 2*LDC.
2174 movlps(qword[CO1 + 0x0], xmm8);
2175 movlps(qword[CO1 + LDC * 1 + 0x0], xmm9);
2176 lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
2177 lea(CO2, ptr[CO2 + LDC * 2]);
2178 align(4);
2179
// labels[35]: tile for a remainder of 1 in I with 2-float A operands, entered
// only when bit 0 of I is set (TEST with mask 0x1 leaves SF = OF = 0, so jle
// is taken exactly when the bit is clear).
2180 L(labels[35]);
2181 test(I, 0x1);
2182 jle(labels[42], T_NEAR);
2183 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
// Preload the first two A pairs (movsd: low 64 bits, upper half zeroed) and
// the first two B scalars (movss: lane 0, other lanes zeroed), and zero the
// accumulators.  xmm12..xmm15 are cleared too but are not referenced again in
// the visible part of this tile.
2184 movsd(xmm0, qword[A - 0x80]);
2185 xorps(xmm8, xmm8);
2186 xorps(xmm9, xmm9);
2187 movsd(xmm2, qword[A - 0x78]);
2188 xorps(xmm10, xmm10);
2189 xorps(xmm11, xmm11);
2190 movss(xmm4, dword[BO - 0x80]);
2191 xorps(xmm12, xmm12);
2192 movss(xmm5, dword[BO - 0x7c]);
2193 xorps(xmm13, xmm13);
2194 xorps(xmm14, xmm14);
2195 xorps(xmm15, xmm15);
// H = K >> 2 four-step passes; none -> go to the K remainder (labels[39]).
// Otherwise bias H by -0x1e: labels[36] runs all but the last 0x1e passes,
// and labels[37] restores the bias (directly, if there are at most 0x1e).
2196 mov(AO, A);
2197 mov(H, K);
2198 sar(H, 0x2);
2199 jle(labels[39], T_NEAR);
2200 sub(H, 0x1e);
2201 jle(labels[37], T_NEAR);
2202 align(4);
2203
// labels[36]: unrolled loop, four steps per pass, with 2-float A operands
// (movsd) and single-float B operands (movss, lane 0 only).  Each step
// multiplies the A register (xmm0 or xmm2) by the B register (xmm4 or xmm5)
// and by three lane permutations of it (pshufd 0xb1 / 0x1b), accumulating
// into xmm8..xmm11.
// Per pass: AO += 32, BO += 16, AA += 8.
2204 L(labels[36]);
2205 prefetcht0(byte[AO + 0x180]);
2206 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
2207 pshufd(xmm6, xmm4, 0xb1);
2208 mulps(xmm4, xmm0);
2209 addps(xmm8, xmm4);
2210 pshufd(xmm4, xmm6, 0x1b);
2211 mulps(xmm6, xmm0);
2212 addps(xmm9, xmm6);
2213 pshufd(xmm6, xmm4, 0xb1);
2214 mulps(xmm4, xmm0);
2215 addps(xmm10, xmm4);
2216 movss(xmm4, dword[BO - 0x78]);
2217 mulps(xmm6, xmm0);
2218 movsd(xmm0, qword[AO - 0x70]);
2219 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
2220 pshufd(xmm6, xmm5, 0xb1);
2221 mulps(xmm5, xmm2);
2222 addps(xmm8, xmm5);
2223 pshufd(xmm5, xmm6, 0x1b);
2224 mulps(xmm6, xmm2);
2225 addps(xmm9, xmm6);
2226 pshufd(xmm6, xmm5, 0xb1);
2227 mulps(xmm5, xmm2);
2228 addps(xmm10, xmm5);
2229 movss(xmm5, dword[BO - 0x74]);
2230 mulps(xmm6, xmm2);
2231 movsd(xmm2, qword[AO - 0x68]);
2232 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
2233 pshufd(xmm6, xmm4, 0xb1);
2234 mulps(xmm4, xmm0);
2235 addps(xmm8, xmm4);
2236 pshufd(xmm4, xmm6, 0x1b);
2237 mulps(xmm6, xmm0);
2238 addps(xmm9, xmm6);
2239 pshufd(xmm6, xmm4, 0xb1);
2240 mulps(xmm4, xmm0);
2241 addps(xmm10, xmm4);
2242 movss(xmm4, dword[BO - 0x70]);
2243 mulps(xmm6, xmm0);
2244 movsd(xmm0, qword[AO - 0x60]);
2245 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
2246 pshufd(xmm6, xmm5, 0xb1);
2247 mulps(xmm5, xmm2);
2248 addps(xmm8, xmm5);
2249 pshufd(xmm5, xmm6, 0x1b);
2250 mulps(xmm6, xmm2);
2251 addps(xmm9, xmm6);
2252 pshufd(xmm6, xmm5, 0xb1);
2253 mulps(xmm5, xmm2);
2254 addps(xmm10, xmm5);
2255 movss(xmm5, dword[BO - 0x6c]);
2256 add(AA, 0x8);
2257 sub(BO, -16);
2258 mulps(xmm6, xmm2);
2259 movsd(xmm2, qword[AO - 0x58]);
2260 sub(AO, -32);
2261 addps(xmm11, xmm6);
2262 prefetcht0(byte[AA - 0x78]);
2263 sub(H, 0x1);
2264 jg(labels[36], T_NEAR);
2265 align(4);
2266
// labels[37]: prefetch the C location this tile will store to (CO1 + 0x4)
// and undo the -0x1e bias on H so that labels[38] executes the remaining
// passes.
2267 L(labels[37]);
2268 prefetcht0(byte[CO1 + 0x4]);
2269 add(H, 0x1e);
2270 align(4);
2271
// labels[38]: same four-step pass as labels[36] (2-float A operands in
// xmm0/xmm2, single-float B operands in xmm4/xmm5, accumulators
// xmm8..xmm11), run for the remaining H passes.
// Per pass: AO += 32, BO += 16, AA += 8.
2272 L(labels[38]);
2273 prefetcht0(byte[AO + 0x180]);
2274 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
2275 pshufd(xmm6, xmm4, 0xb1);
2276 mulps(xmm4, xmm0);
2277 addps(xmm8, xmm4);
2278 pshufd(xmm4, xmm6, 0x1b);
2279 mulps(xmm6, xmm0);
2280 addps(xmm9, xmm6);
2281 pshufd(xmm6, xmm4, 0xb1);
2282 mulps(xmm4, xmm0);
2283 addps(xmm10, xmm4);
2284 movss(xmm4, dword[BO - 0x78]);
2285 mulps(xmm6, xmm0);
2286 movsd(xmm0, qword[AO - 0x70]);
2287 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
2288 pshufd(xmm6, xmm5, 0xb1);
2289 mulps(xmm5, xmm2);
2290 addps(xmm8, xmm5);
2291 pshufd(xmm5, xmm6, 0x1b);
2292 mulps(xmm6, xmm2);
2293 addps(xmm9, xmm6);
2294 pshufd(xmm6, xmm5, 0xb1);
2295 mulps(xmm5, xmm2);
2296 addps(xmm10, xmm5);
2297 movss(xmm5, dword[BO - 0x74]);
2298 mulps(xmm6, xmm2);
2299 movsd(xmm2, qword[AO - 0x68]);
2300 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
2301 pshufd(xmm6, xmm4, 0xb1);
2302 mulps(xmm4, xmm0);
2303 addps(xmm8, xmm4);
2304 pshufd(xmm4, xmm6, 0x1b);
2305 mulps(xmm6, xmm0);
2306 addps(xmm9, xmm6);
2307 pshufd(xmm6, xmm4, 0xb1);
2308 mulps(xmm4, xmm0);
2309 addps(xmm10, xmm4);
2310 movss(xmm4, dword[BO - 0x70]);
2311 mulps(xmm6, xmm0);
2312 movsd(xmm0, qword[AO - 0x60]);
2313 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
2314 pshufd(xmm6, xmm5, 0xb1);
2315 mulps(xmm5, xmm2);
2316 addps(xmm8, xmm5);
2317 pshufd(xmm5, xmm6, 0x1b);
2318 mulps(xmm6, xmm2);
2319 addps(xmm9, xmm6);
2320 pshufd(xmm6, xmm5, 0xb1);
2321 mulps(xmm5, xmm2);
2322 addps(xmm10, xmm5);
2323 movss(xmm5, dword[BO - 0x6c]);
2324 add(AA, 0x8);
2325 sub(BO, -16);
2326 mulps(xmm6, xmm2);
2327 movsd(xmm2, qword[AO - 0x58]);
2328 sub(AO, -32);
2329 addps(xmm11, xmm6);
2330 prefetcht0(byte[AA - 0x78]);
2331 sub(H, 0x1);
2332 jg(labels[38], T_NEAR);
2333 align(4);
2334
// labels[39]: K remainder.  H = K & 3 single steps remain; skip to labels[41]
// when that is zero.
2335 L(labels[39]);
2336 mov(H, K);
2337 and_(H, 0x3);
2338 je(labels[41], T_NEAR);
2339 align(4);
2340
// labels[40]: one step per iteration: xmm0 (2-float A operand) times xmm4
// (single B float in lane 0) and its lane permutations, accumulated into
// xmm8..xmm11.  AO advances by 8 bytes and BO by 4 bytes per step.
// The movaps copies into xmm7 are not read anywhere in this block.
2341 L(labels[40]);
2342 pshufd(xmm6, xmm4, 0xb1);
2343 movaps(xmm7, xmm4);
2344 mulps(xmm4, xmm0);
2345 addps(xmm8, xmm4);
2346 pshufd(xmm4, xmm6, 0x1b);
2347 movaps(xmm7, xmm6);
2348 mulps(xmm6, xmm0);
2349 addps(xmm9, xmm6);
2350 pshufd(xmm6, xmm4, 0xb1);
2351 movaps(xmm7, xmm4);
2352 mulps(xmm4, xmm0);
2353 addps(xmm10, xmm4);
2354 movss(xmm4, dword[BO - 0x7c]);
2355 movaps(xmm7, xmm6);
2356 mulps(xmm6, xmm0);
2357 movsd(xmm0, qword[AO - 0x78]);
2358 addps(xmm11, xmm6);
2359 sub(AO, -8);
2360 sub(BO, -4);
2361 dec(H);
2362 jg(labels[40], T_NEAR);
2363 align(4);
2364
// labels[41]: gather the products (one per accumulator, each in a different
// lane) into xmm8, ordered by A lane, and store its low two floats.
// xmm0, xmm1 and xmm9 are computed but not stored in this block.
2365 L(labels[41]);
2366 movaps(xmm0, xmm8);
2367 unpcklpd(xmm8, xmm9);
2368 unpckhpd(xmm0, xmm9);
2369 movaps(xmm1, xmm10);
2370 unpckhpd(xmm10, xmm11);
2371 unpcklpd(xmm1, xmm11);
2372 movaps(xmm9, xmm8);
2373 shufps(xmm8, xmm10, 0xcc);
2374 shufps(xmm9, xmm10, 0x66);
// C is written without being read first; both C pointers then move by LDC.
2375 movlps(qword[CO1 + 0x0], xmm8);
2376 lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
2377 lea(CO2, ptr[CO2 + LDC * 1]);
2378 align(4);
2379
// labels[42]: the tiles above are done; A takes over the position AO has
// reached.
2380 L(labels[42]);
2381 mov(A, AO);
2382 align(4);
2383
// labels[43]: section for a remainder of 1 in J, entered only when bit 0 of J
// is set (TEST with mask 0x1 leaves SF = OF = 0, so jle is taken exactly when
// the bit is clear).
2384 L(labels[43]);
2385 test(J, 0x1);
2386 jle(labels[68], T_NEAR);
// AA = A + K * 4 (address used only for prefetching inside the loops below).
2387 mov(AA, K);
2388 imul(AA, AA, 0x4);
2389 add(AA, A);
// CO1 starts at the current C position; C itself moves on by 4 bytes.
2390 mov(CO1, C);
2391 add(C, 0x4);
// Restart BO at B and the I counter at N; fewer than 4 -> remainder tiles.
2392 mov(BO, B);
2393 mov(I, N);
2394 cmp(I, 0x4);
2395 jl(labels[51], T_NEAR);
2396 align(4);
2397
// labels[44]: set-up of the tile that consumes 4 from I with single-float A
// operands.  Preloads the first two A scalars (movss from memory fills lane 0
// and zeroes the other lanes) and the first two 4-float B vectors, and zeroes
// the accumulators.  xmm12..xmm15 are cleared too but are not referenced
// again in the visible part of this tile.
2398 L(labels[44]);
2399 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
2400 movss(xmm0, dword[A - 0x80]);
2401 xorps(xmm8, xmm8);
2402 xorps(xmm9, xmm9);
2403 movss(xmm2, dword[A - 0x7c]);
2404 xorps(xmm10, xmm10);
2405 xorps(xmm11, xmm11);
2406 movaps(xmm4, xword[BO - 0x80]);
2407 xorps(xmm12, xmm12);
2408 movaps(xmm5, xword[BO - 0x70]);
2409 xorps(xmm13, xmm13);
2410 xorps(xmm14, xmm14);
2411 xorps(xmm15, xmm15);
// H = K >> 2 four-step passes; none -> go to the K remainder (labels[48]).
// Otherwise bias H by -0x1e: labels[45] runs all but the last 0x1e passes,
// and labels[46] restores the bias (directly, if there are at most 0x1e).
2412 mov(AO, A);
2413 mov(H, K);
2414 sar(H, 0x2);
2415 jle(labels[48], T_NEAR);
2416 sub(H, 0x1e);
2417 jle(labels[46], T_NEAR);
2418 align(4);
2419
// labels[45]: unrolled loop, four steps per pass, with single-float A
// operands (movss, lane 0 only) and 4-float B vectors.  Each step multiplies
// the A register (xmm0 or xmm2) by the B vector (xmm4 or xmm5) and by three
// lane permutations of it (pshufd 0xb1 swaps the floats inside each 64-bit
// half, 0x1b reverses all four lanes), accumulating into xmm8..xmm11.
// Per pass: AO += 16, BO += 64, AA += 8.
2420 L(labels[45]);
2421 prefetcht0(byte[AO + 0x180]);
2422 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
2423 pshufd(xmm6, xmm4, 0xb1);
2424 mulps(xmm4, xmm0);
2425 addps(xmm8, xmm4);
2426 pshufd(xmm4, xmm6, 0x1b);
2427 mulps(xmm6, xmm0);
2428 addps(xmm9, xmm6);
2429 pshufd(xmm6, xmm4, 0xb1);
2430 mulps(xmm4, xmm0);
2431 addps(xmm10, xmm4);
2432 movaps(xmm4, xword[BO - 0x60]);
2433 mulps(xmm6, xmm0);
2434 movss(xmm0, dword[AO - 0x78]);
2435 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
2436 pshufd(xmm6, xmm5, 0xb1);
2437 mulps(xmm5, xmm2);
2438 addps(xmm8, xmm5);
2439 pshufd(xmm5, xmm6, 0x1b);
2440 mulps(xmm6, xmm2);
2441 addps(xmm9, xmm6);
2442 pshufd(xmm6, xmm5, 0xb1);
2443 mulps(xmm5, xmm2);
2444 addps(xmm10, xmm5);
2445 movaps(xmm5, xword[BO - 0x50]);
2446 mulps(xmm6, xmm2);
2447 movss(xmm2, dword[AO - 0x74]);
2448 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
2449 pshufd(xmm6, xmm4, 0xb1);
2450 mulps(xmm4, xmm0);
2451 addps(xmm8, xmm4);
2452 pshufd(xmm4, xmm6, 0x1b);
2453 mulps(xmm6, xmm0);
2454 addps(xmm9, xmm6);
2455 pshufd(xmm6, xmm4, 0xb1);
2456 mulps(xmm4, xmm0);
2457 addps(xmm10, xmm4);
2458 movaps(xmm4, xword[BO - 0x40]);
2459 mulps(xmm6, xmm0);
2460 movss(xmm0, dword[AO - 0x70]);
2461 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
2462 pshufd(xmm6, xmm5, 0xb1);
2463 mulps(xmm5, xmm2);
2464 addps(xmm8, xmm5);
2465 pshufd(xmm5, xmm6, 0x1b);
2466 mulps(xmm6, xmm2);
2467 addps(xmm9, xmm6);
2468 pshufd(xmm6, xmm5, 0xb1);
2469 mulps(xmm5, xmm2);
2470 addps(xmm10, xmm5);
2471 movaps(xmm5, xword[BO - 0x30]);
2472 add(AA, 0x8);
2473 sub(BO, -64);
2474 mulps(xmm6, xmm2);
2475 movss(xmm2, dword[AO - 0x6c]);
2476 sub(AO, -16);
2477 addps(xmm11, xmm6);
2478 prefetcht0(byte[AA - 0x78]);
2479 sub(H, 0x1);
2480 jg(labels[45], T_NEAR);
2481 align(4);
2482
// labels[46]: prefetch the four C locations addressed through CO1 and CO2
// (each with and without LDC) and undo the -0x1e bias on H so that labels[47]
// executes the remaining passes.
2483 L(labels[46]);
2484 prefetcht0(byte[CO1 + 0x0]);
2485 prefetcht0(byte[CO1 + LDC * 1 + 0x0]);
2486 prefetcht0(byte[CO2]);
2487 prefetcht0(byte[CO2 + LDC * 1]);
2488 add(H, 0x1e);
2489 align(4);
2490
// labels[47]: same four-step pass as labels[45] (single-float A operands in
// xmm0/xmm2, 4-float B vectors in xmm4/xmm5, accumulators xmm8..xmm11), run
// for the remaining H passes.
// Per pass: AO += 16, BO += 64, AA += 8.
2491 L(labels[47]);
2492 prefetcht0(byte[AO + 0x180]);
2493 prefetcht0(byte[BO + 0x100]);
// step 1: xmm0 * xmm4
2494 pshufd(xmm6, xmm4, 0xb1);
2495 mulps(xmm4, xmm0);
2496 addps(xmm8, xmm4);
2497 pshufd(xmm4, xmm6, 0x1b);
2498 mulps(xmm6, xmm0);
2499 addps(xmm9, xmm6);
2500 pshufd(xmm6, xmm4, 0xb1);
2501 mulps(xmm4, xmm0);
2502 addps(xmm10, xmm4);
2503 movaps(xmm4, xword[BO - 0x60]);
2504 mulps(xmm6, xmm0);
2505 movss(xmm0, dword[AO - 0x78]);
2506 addps(xmm11, xmm6);
// step 2: xmm2 * xmm5
2507 pshufd(xmm6, xmm5, 0xb1);
2508 mulps(xmm5, xmm2);
2509 addps(xmm8, xmm5);
2510 pshufd(xmm5, xmm6, 0x1b);
2511 mulps(xmm6, xmm2);
2512 addps(xmm9, xmm6);
2513 pshufd(xmm6, xmm5, 0xb1);
2514 mulps(xmm5, xmm2);
2515 addps(xmm10, xmm5);
2516 movaps(xmm5, xword[BO - 0x50]);
2517 mulps(xmm6, xmm2);
2518 movss(xmm2, dword[AO - 0x74]);
2519 addps(xmm11, xmm6);
// step 3: xmm0 * xmm4
2520 pshufd(xmm6, xmm4, 0xb1);
2521 mulps(xmm4, xmm0);
2522 addps(xmm8, xmm4);
2523 pshufd(xmm4, xmm6, 0x1b);
2524 mulps(xmm6, xmm0);
2525 addps(xmm9, xmm6);
2526 pshufd(xmm6, xmm4, 0xb1);
2527 mulps(xmm4, xmm0);
2528 addps(xmm10, xmm4);
2529 movaps(xmm4, xword[BO - 0x40]);
2530 mulps(xmm6, xmm0);
2531 movss(xmm0, dword[AO - 0x70]);
2532 addps(xmm11, xmm6);
// step 4: xmm2 * xmm5, interleaved with the pointer updates
2533 pshufd(xmm6, xmm5, 0xb1);
2534 mulps(xmm5, xmm2);
2535 addps(xmm8, xmm5);
2536 pshufd(xmm5, xmm6, 0x1b);
2537 mulps(xmm6, xmm2);
2538 addps(xmm9, xmm6);
2539 pshufd(xmm6, xmm5, 0xb1);
2540 mulps(xmm5, xmm2);
2541 addps(xmm10, xmm5);
2542 movaps(xmm5, xword[BO - 0x30]);
2543 add(AA, 0x8);
2544 sub(BO, -64);
2545 mulps(xmm6, xmm2);
2546 movss(xmm2, dword[AO - 0x6c]);
2547 sub(AO, -16);
2548 addps(xmm11, xmm6);
2549 prefetcht0(byte[AA - 0x78]);
2550 sub(H, 0x1);
2551 jg(labels[47], T_NEAR);
2552 align(4);
2553
2554 L(labels[48]);
2555 mov(H, K);
2556 and_(H, 0x3);
2557 je(labels[50], T_NEAR);
2558 align(4);
2559
2560 L(labels[49]);
2561 pshufd(xmm6, xmm4, 0xb1);
2562 movaps(xmm7, xmm4);
2563 mulps(xmm4, xmm0);
2564 addps(xmm8, xmm4);
2565 pshufd(xmm4, xmm6, 0x1b);
2566 movaps(xmm7, xmm6);
2567 mulps(xmm6, xmm0);
2568 addps(xmm9, xmm6);
2569 pshufd(xmm6, xmm4, 0xb1);
2570 movaps(xmm7, xmm4);
2571 mulps(xmm4, xmm0);
2572 addps(xmm10, xmm4);
2573 movaps(xmm4, xword[BO - 0x70]);
2574 movaps(xmm7, xmm6);
2575 mulps(xmm6, xmm0);
2576 movss(xmm0, dword[AO - 0x7c]);
2577 addps(xmm11, xmm6);
2578 sub(AO, -4);
2579 sub(BO, -16);
2580 dec(H);
2581 jg(labels[49], T_NEAR);
2582 align(4);
2583
2584 L(labels[50]);
2585 movaps(xmm0, xmm8);
2586 unpcklpd(xmm8, xmm9);
2587 unpckhpd(xmm0, xmm9);
2588 movaps(xmm1, xmm10);
2589 unpckhpd(xmm10, xmm11);
2590 unpcklpd(xmm1, xmm11);
2591 movaps(xmm9, xmm8);
2592 shufps(xmm8, xmm10, 0xcc);
2593 shufps(xmm9, xmm10, 0x66);
2594 movaps(xmm10, xmm1);
2595 movaps(xmm11, xmm1);
2596 shufps(xmm10, xmm0, 0xcc);
2597 shufps(xmm11, xmm0, 0x66);
2598 movss(dword[CO1 + 0x0], xmm8);
2599 movss(dword[CO1 + LDC * 1 + 0x0], xmm9);
2600 movss(dword[CO2], xmm10);
2601 movss(dword[CO2 + LDC * 1], xmm11);
2602 lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
2603 lea(CO2, ptr[CO2 + LDC * 4]);
2604 sub(I, 0x4);
2605 cmp(I, 0x4);
2606 jge(labels[44], T_NEAR);
2607 align(4);
2608
2609 L(labels[51]);
2610 test(I, 0x2);
2611 jle(labels[59], T_NEAR);
2612 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
2613 movss(xmm0, dword[A - 0x80]);
2614 xorps(xmm8, xmm8);
2615 xorps(xmm9, xmm9);
2616 movss(xmm2, dword[A - 0x7c]);
2617 xorps(xmm10, xmm10);
2618 xorps(xmm11, xmm11);
2619 movddup(xmm4, qword[BO - 0x80]);
2620 xorps(xmm12, xmm12);
2621 movddup(xmm5, qword[BO - 0x78]);
2622 xorps(xmm13, xmm13);
2623 xorps(xmm14, xmm14);
2624 xorps(xmm15, xmm15);
2625 mov(AO, A);
2626 mov(H, K);
2627 sar(H, 0x2);
2628 jle(labels[55], T_NEAR);
2629 sub(H, 0x1e);
2630 jle(labels[53], T_NEAR);
2631 align(4);
2632
2633 L(labels[52]);
2634 prefetcht0(byte[AO + 0x180]);
2635 prefetcht0(byte[BO + 0x100]);
2636 pshufd(xmm6, xmm4, 0xb1);
2637 mulps(xmm4, xmm0);
2638 addps(xmm8, xmm4);
2639 pshufd(xmm4, xmm6, 0x1b);
2640 mulps(xmm6, xmm0);
2641 addps(xmm9, xmm6);
2642 pshufd(xmm6, xmm4, 0xb1);
2643 mulps(xmm4, xmm0);
2644 addps(xmm10, xmm4);
2645 movddup(xmm4, qword[BO - 0x70]);
2646 mulps(xmm6, xmm0);
2647 movss(xmm0, dword[AO - 0x78]);
2648 addps(xmm11, xmm6);
2649 pshufd(xmm6, xmm5, 0xb1);
2650 mulps(xmm5, xmm2);
2651 addps(xmm8, xmm5);
2652 pshufd(xmm5, xmm6, 0x1b);
2653 mulps(xmm6, xmm2);
2654 addps(xmm9, xmm6);
2655 pshufd(xmm6, xmm5, 0xb1);
2656 mulps(xmm5, xmm2);
2657 addps(xmm10, xmm5);
2658 movddup(xmm5, qword[BO - 0x68]);
2659 mulps(xmm6, xmm2);
2660 movss(xmm2, dword[AO - 0x74]);
2661 addps(xmm11, xmm6);
2662 pshufd(xmm6, xmm4, 0xb1);
2663 mulps(xmm4, xmm0);
2664 addps(xmm8, xmm4);
2665 pshufd(xmm4, xmm6, 0x1b);
2666 mulps(xmm6, xmm0);
2667 addps(xmm9, xmm6);
2668 pshufd(xmm6, xmm4, 0xb1);
2669 mulps(xmm4, xmm0);
2670 addps(xmm10, xmm4);
2671 movddup(xmm4, qword[BO - 0x60]);
2672 mulps(xmm6, xmm0);
2673 movss(xmm0, dword[AO - 0x70]);
2674 addps(xmm11, xmm6);
2675 pshufd(xmm6, xmm5, 0xb1);
2676 mulps(xmm5, xmm2);
2677 addps(xmm8, xmm5);
2678 pshufd(xmm5, xmm6, 0x1b);
2679 mulps(xmm6, xmm2);
2680 addps(xmm9, xmm6);
2681 pshufd(xmm6, xmm5, 0xb1);
2682 mulps(xmm5, xmm2);
2683 addps(xmm10, xmm5);
2684 movddup(xmm5, qword[BO - 0x58]);
2685 add(AA, 0x8);
2686 sub(BO, -32);
2687 mulps(xmm6, xmm2);
2688 movss(xmm2, dword[AO - 0x6c]);
2689 sub(AO, -16);
2690 addps(xmm11, xmm6);
2691 prefetcht0(byte[AA - 0x78]);
2692 sub(H, 0x1);
2693 jg(labels[52], T_NEAR);
2694 align(4);
2695
2696 L(labels[53]);
2697 prefetcht0(byte[CO1 + 0x0]);
2698 prefetcht0(byte[CO1 + LDC * 1 + 0x0]);
2699 add(H, 0x1e);
2700 align(4);
2701
2702 L(labels[54]);
2703 prefetcht0(byte[AO + 0x180]);
2704 prefetcht0(byte[BO + 0x100]);
2705 pshufd(xmm6, xmm4, 0xb1);
2706 mulps(xmm4, xmm0);
2707 addps(xmm8, xmm4);
2708 pshufd(xmm4, xmm6, 0x1b);
2709 mulps(xmm6, xmm0);
2710 addps(xmm9, xmm6);
2711 pshufd(xmm6, xmm4, 0xb1);
2712 mulps(xmm4, xmm0);
2713 addps(xmm10, xmm4);
2714 movddup(xmm4, qword[BO - 0x70]);
2715 mulps(xmm6, xmm0);
2716 movss(xmm0, dword[AO - 0x78]);
2717 addps(xmm11, xmm6);
2718 pshufd(xmm6, xmm5, 0xb1);
2719 mulps(xmm5, xmm2);
2720 addps(xmm8, xmm5);
2721 pshufd(xmm5, xmm6, 0x1b);
2722 mulps(xmm6, xmm2);
2723 addps(xmm9, xmm6);
2724 pshufd(xmm6, xmm5, 0xb1);
2725 mulps(xmm5, xmm2);
2726 addps(xmm10, xmm5);
2727 movddup(xmm5, qword[BO - 0x68]);
2728 mulps(xmm6, xmm2);
2729 movss(xmm2, dword[AO - 0x74]);
2730 addps(xmm11, xmm6);
2731 pshufd(xmm6, xmm4, 0xb1);
2732 mulps(xmm4, xmm0);
2733 addps(xmm8, xmm4);
2734 pshufd(xmm4, xmm6, 0x1b);
2735 mulps(xmm6, xmm0);
2736 addps(xmm9, xmm6);
2737 pshufd(xmm6, xmm4, 0xb1);
2738 mulps(xmm4, xmm0);
2739 addps(xmm10, xmm4);
2740 movddup(xmm4, qword[BO - 0x60]);
2741 mulps(xmm6, xmm0);
2742 movss(xmm0, dword[AO - 0x70]);
2743 addps(xmm11, xmm6);
2744 pshufd(xmm6, xmm5, 0xb1);
2745 mulps(xmm5, xmm2);
2746 addps(xmm8, xmm5);
2747 pshufd(xmm5, xmm6, 0x1b);
2748 mulps(xmm6, xmm2);
2749 addps(xmm9, xmm6);
2750 pshufd(xmm6, xmm5, 0xb1);
2751 mulps(xmm5, xmm2);
2752 addps(xmm10, xmm5);
2753 movddup(xmm5, qword[BO - 0x58]);
2754 add(AA, 0x8);
2755 sub(BO, -32);
2756 mulps(xmm6, xmm2);
2757 movss(xmm2, dword[AO - 0x6c]);
2758 sub(AO, -16);
2759 addps(xmm11, xmm6);
2760 prefetcht0(byte[AA - 0x78]);
2761 sub(H, 0x1);
2762 jg(labels[54], T_NEAR);
2763 align(4);
2764
2765 L(labels[55]);
2766 mov(H, K);
2767 and_(H, 0x3);
2768 je(labels[58], T_NEAR);
2769 align(4);
2770
2771 L(labels[56]);
2772 pshufd(xmm6, xmm4, 0xb1);
2773 movaps(xmm7, xmm4);
2774 mulps(xmm4, xmm0);
2775 addps(xmm8, xmm4);
2776 pshufd(xmm4, xmm6, 0x1b);
2777 movaps(xmm7, xmm6);
2778 mulps(xmm6, xmm0);
2779 addps(xmm9, xmm6);
2780 pshufd(xmm6, xmm4, 0xb1);
2781 movaps(xmm7, xmm4);
2782 mulps(xmm4, xmm0);
2783 addps(xmm10, xmm4);
2784 movddup(xmm4, qword[BO - 0x78]);
2785 movaps(xmm7, xmm6);
2786 mulps(xmm6, xmm0);
2787 movss(xmm0, dword[AO - 0x7c]);
2788 addps(xmm11, xmm6);
2789 sub(AO, -4);
2790 sub(BO, -8);
2791 dec(H);
2792 jg(labels[56], T_NEAR);
2793 align(4);
2794
2795 L(labels[58]);
2796 movaps(xmm0, xmm8);
2797 unpcklpd(xmm8, xmm9);
2798 unpckhpd(xmm0, xmm9);
2799 movaps(xmm1, xmm10);
2800 unpckhpd(xmm10, xmm11);
2801 unpcklpd(xmm1, xmm11);
2802 movaps(xmm9, xmm8);
2803 shufps(xmm8, xmm10, 0xcc);
2804 shufps(xmm9, xmm10, 0x66);
2805 movss(dword[CO1 + 0x0], xmm8);
2806 movss(dword[CO1 + LDC * 1 + 0x0], xmm9);
2807 lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
2808 lea(CO2, ptr[CO2 + LDC * 2]);
2809 align(4);
2810
2811 L(labels[59]);
2812 test(I, 0x1);
2813 jle(labels[67], T_NEAR);
2814 lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
2815 movss(xmm0, dword[A - 0x80]);
2816 xorps(xmm8, xmm8);
2817 xorps(xmm9, xmm9);
2818 movss(xmm2, dword[A - 0x7c]);
2819 xorps(xmm10, xmm10);
2820 xorps(xmm11, xmm11);
2821 movss(xmm4, dword[BO - 0x80]);
2822 xorps(xmm12, xmm12);
2823 movss(xmm5, dword[BO - 0x7c]);
2824 xorps(xmm13, xmm13);
2825 xorps(xmm14, xmm14);
2826 xorps(xmm15, xmm15);
2827 mov(AO, A);
2828 mov(H, K);
2829 sar(H, 0x2);
2830 jle(labels[64], T_NEAR);
2831 sub(H, 0x1e);
2832 jle(labels[62], T_NEAR);
2833 align(4);
2834
2835 L(labels[60]);
2836 prefetcht0(byte[AO + 0x180]);
2837 prefetcht0(byte[BO + 0x100]);
2838 pshufd(xmm6, xmm4, 0xb1);
2839 mulps(xmm4, xmm0);
2840 addps(xmm8, xmm4);
2841 pshufd(xmm4, xmm6, 0x1b);
2842 mulps(xmm6, xmm0);
2843 addps(xmm9, xmm6);
2844 pshufd(xmm6, xmm4, 0xb1);
2845 mulps(xmm4, xmm0);
2846 addps(xmm10, xmm4);
2847 movss(xmm4, dword[BO - 0x78]);
2848 mulps(xmm6, xmm0);
2849 movss(xmm0, dword[AO - 0x78]);
2850 addps(xmm11, xmm6);
2851 pshufd(xmm6, xmm5, 0xb1);
2852 mulps(xmm5, xmm2);
2853 addps(xmm8, xmm5);
2854 pshufd(xmm5, xmm6, 0x1b);
2855 mulps(xmm6, xmm2);
2856 addps(xmm9, xmm6);
2857 pshufd(xmm6, xmm5, 0xb1);
2858 mulps(xmm5, xmm2);
2859 addps(xmm10, xmm5);
2860 movss(xmm5, dword[BO - 0x74]);
2861 mulps(xmm6, xmm2);
2862 movss(xmm2, dword[AO - 0x74]);
2863 addps(xmm11, xmm6);
2864 pshufd(xmm6, xmm4, 0xb1);
2865 mulps(xmm4, xmm0);
2866 addps(xmm8, xmm4);
2867 pshufd(xmm4, xmm6, 0x1b);
2868 mulps(xmm6, xmm0);
2869 addps(xmm9, xmm6);
2870 pshufd(xmm6, xmm4, 0xb1);
2871 mulps(xmm4, xmm0);
2872 addps(xmm10, xmm4);
2873 movss(xmm4, dword[BO - 0x70]);
2874 mulps(xmm6, xmm0);
2875 movss(xmm0, dword[AO - 0x70]);
2876 addps(xmm11, xmm6);
2877 pshufd(xmm6, xmm5, 0xb1);
2878 mulps(xmm5, xmm2);
2879 addps(xmm8, xmm5);
2880 pshufd(xmm5, xmm6, 0x1b);
2881 mulps(xmm6, xmm2);
2882 addps(xmm9, xmm6);
2883 pshufd(xmm6, xmm5, 0xb1);
2884 mulps(xmm5, xmm2);
2885 addps(xmm10, xmm5);
2886 movss(xmm5, dword[BO - 0x6c]);
2887 add(AA, 0x8);
2888 sub(BO, -16);
2889 mulps(xmm6, xmm2);
2890 movss(xmm2, dword[AO - 0x6c]);
2891 sub(AO, -16);
2892 addps(xmm11, xmm6);
2893 prefetcht0(byte[AA - 0x78]);
2894 sub(H, 0x1);
2895 jg(labels[60], T_NEAR);
2896 align(4);
2897
2898 L(labels[62]);
2899 prefetcht0(byte[CO1 + 0x0]);
2900 add(H, 0x1e);
2901 align(4);
2902
2903 L(labels[63]);
2904 prefetcht0(byte[AO + 0x180]);
2905 prefetcht0(byte[BO + 0x100]);
2906 pshufd(xmm6, xmm4, 0xb1);
2907 mulps(xmm4, xmm0);
2908 addps(xmm8, xmm4);
2909 pshufd(xmm4, xmm6, 0x1b);
2910 mulps(xmm6, xmm0);
2911 addps(xmm9, xmm6);
2912 pshufd(xmm6, xmm4, 0xb1);
2913 mulps(xmm4, xmm0);
2914 addps(xmm10, xmm4);
2915 movss(xmm4, dword[BO - 0x78]);
2916 mulps(xmm6, xmm0);
2917 movss(xmm0, dword[AO - 0x78]);
2918 addps(xmm11, xmm6);
2919 pshufd(xmm6, xmm5, 0xb1);
2920 mulps(xmm5, xmm2);
2921 addps(xmm8, xmm5);
2922 pshufd(xmm5, xmm6, 0x1b);
2923 mulps(xmm6, xmm2);
2924 addps(xmm9, xmm6);
2925 pshufd(xmm6, xmm5, 0xb1);
2926 mulps(xmm5, xmm2);
2927 addps(xmm10, xmm5);
2928 movss(xmm5, dword[BO - 0x74]);
2929 mulps(xmm6, xmm2);
2930 movss(xmm2, dword[AO - 0x74]);
2931 addps(xmm11, xmm6);
2932 pshufd(xmm6, xmm4, 0xb1);
2933 mulps(xmm4, xmm0);
2934 addps(xmm8, xmm4);
2935 pshufd(xmm4, xmm6, 0x1b);
2936 mulps(xmm6, xmm0);
2937 addps(xmm9, xmm6);
2938 pshufd(xmm6, xmm4, 0xb1);
2939 mulps(xmm4, xmm0);
2940 addps(xmm10, xmm4);
2941 movss(xmm4, dword[BO - 0x70]);
2942 mulps(xmm6, xmm0);
2943 movss(xmm0, dword[AO - 0x70]);
2944 addps(xmm11, xmm6);
2945 pshufd(xmm6, xmm5, 0xb1);
2946 mulps(xmm5, xmm2);
2947 addps(xmm8, xmm5);
2948 pshufd(xmm5, xmm6, 0x1b);
2949 mulps(xmm6, xmm2);
2950 addps(xmm9, xmm6);
2951 pshufd(xmm6, xmm5, 0xb1);
2952 mulps(xmm5, xmm2);
2953 addps(xmm10, xmm5);
2954 movss(xmm5, dword[BO - 0x6c]);
2955 add(AA, 0x8);
2956 sub(BO, -16);
2957 mulps(xmm6, xmm2);
2958 movss(xmm2, dword[AO - 0x6c]);
2959 sub(AO, -16);
2960 addps(xmm11, xmm6);
2961 prefetcht0(byte[AA - 0x78]);
2962 sub(H, 0x1);
2963 jg(labels[63], T_NEAR);
2964 align(4);
2965
2966 L(labels[64]);
2967 mov(H, K);
2968 and_(H, 0x3);
2969 je(labels[66], T_NEAR);
2970 align(4);
2971
2972 L(labels[65]);
2973 pshufd(xmm6, xmm4, 0xb1);
2974 movaps(xmm7, xmm4);
2975 mulps(xmm4, xmm0);
2976 addps(xmm8, xmm4);
2977 pshufd(xmm4, xmm6, 0x1b);
2978 movaps(xmm7, xmm6);
2979 mulps(xmm6, xmm0);
2980 addps(xmm9, xmm6);
2981 pshufd(xmm6, xmm4, 0xb1);
2982 movaps(xmm7, xmm4);
2983 mulps(xmm4, xmm0);
2984 addps(xmm10, xmm4);
2985 movss(xmm4, dword[BO - 0x7c]);
2986 movaps(xmm7, xmm6);
2987 mulps(xmm6, xmm0);
2988 movss(xmm0, dword[AO - 0x7c]);
2989 addps(xmm11, xmm6);
2990 sub(AO, -4);
2991 sub(BO, -4);
2992 dec(H);
2993 jg(labels[65], T_NEAR);
2994 align(4);
2995
2996 L(labels[66]);
2997 movaps(xmm0, xmm8);
2998 unpcklpd(xmm8, xmm9);
2999 unpckhpd(xmm0, xmm9);
3000 movaps(xmm1, xmm10);
3001 unpckhpd(xmm10, xmm11);
3002 unpcklpd(xmm1, xmm11);
3003 movaps(xmm9, xmm8);
3004 shufps(xmm8, xmm10, 0xcc);
3005 shufps(xmm9, xmm10, 0x66);
3006 movss(dword[CO1 + 0x0], xmm8);
3007 lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
3008 lea(CO2, ptr[CO2 + LDC * 1]);
3009 align(4);
3010
3011 L(labels[67]);
3012 mov(A, AO);
3013 align(4);
3014
3015 L(labels[68]);
3016
3017 postamble();
3018 }
3019 outLocalLabel();
3020
3021 #undef M
3022 #undef N
3023 #undef K
3024 #undef A
3025 #undef B
3026 #undef C
3027 #undef LDC
3028 #undef AA
3029 #undef I
3030 #undef J
3031 #undef H
3032 #undef AO
3033 #undef BO
3034 #undef CO1
3035 #undef CO2
3036 #ifdef _WIN32
3037 #undef OLD_A
3038 #undef OLD_B
3039 #endif
3040 #undef OLD_C
3041 #undef OLD_LDC
3042 }
3043
3044 } // namespace x64
3045 } // namespace cpu
3046 } // namespace impl
3047 } // namespace dnnl
3048