1/***************************************************************************
2Copyright (c) 2013-2016, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*	 LAPACK-TEST		: OK
34**************************************************************************************/
35
36
/*
 * Outer loop over pairs of columns (J = N >> 1 passes).
 *
 * Each pass first expands the current B panel into BBUFFER, then resets the
 * A/C cursors and walks the M dimension in tiles of 8, 4, 2 and 1 rows.
 * Register names (J, N, K, M, B, BO, BBO, BBUFFER, PRE, T1, T2, ...) and the
 * ZCOPYB_* macros are defined outside this view.
 */
37	srawi.		J,	N,	1		// J = N >> 1, CR0 set from the result
38	ble		ZGEMM_L2_END			// fewer than two columns: skip the pair loop
39
40ZGEMM_L2_BEGIN:
41
42	mr		BO,	B			// BO  = read cursor over the current B panel
43	mr		BBO,	BBUFFER			// BBO = write cursor into the expanded copy
44	srawi.		T1,	K,	2		// T1 = K >> 2 passes of the bulk copy
45	ble		ZGEMM_L2_COPYB1
46
// Bulk copy: one ZCOPYB_8x1 per pass, with prefetch hints for the source
// (dcbt) and for two destination addresses 128 bytes apart (dcbtst).
// NOTE(review): ZCOPYB_8x1 presumably expands 8 complex values of B and
// advances BO/BBO itself - confirm against the macro definition.
47ZGEMM_L2_COPYB8:
48
49	addi		T2,	PRE, 128
50	dcbt		BO,	PRE
51	dcbtst		BBO,	PRE
52	dcbtst		BBO,	T2
53	ZCOPYB_8x1
54	addic.		T1,	T1,	-1
55
56	bgt		ZGEMM_L2_COPYB8
57
58ZGEMM_L2_COPYB1:
59
60	andi.		T1,	K,	3		// remaining K & 3 steps
61	ble		ZGEMM_L2_COPYB_END		// andi. result is never negative: taken only when zero
62
// Remainder copy: two ZCOPYB_1x1 invocations per remaining step.
63ZGEMM_L2_COPYB_LOOP:
64
65	ZCOPYB_1x1
66	ZCOPYB_1x1
67	addic.          T1,     T1,     -1
68
69	bgt             ZGEMM_L2_COPYB_LOOP
70
71ZGEMM_L2_COPYB_END:
72
73	mr		CO,	C			// CO = write cursor for this column pair of C
74	mr		AO,	A			// AO = restart at the beginning of A
75	slwi		T1,	LDC	,	1	// T1 = 2 * LDC (LDC presumably already a byte stride - confirm)
76	add		C,	C,	T1		// C now points at the next column pair
77	srawi.		I,	M,	3		// I = M >> 3 tiles of 8 rows
78	ble		ZGEMM_L2x8_END
79
/*
 * 2-column x 8-row tile loop; runs I = M >> 3 times per column pair.
 *
 * Dispatch on L = K >> 3:
 *   L <= 0 : ZGEMM_L2x8_SUB0  - KERNEL2x8_SUBI1 once, then the K & 7 tail
 *   L == 1 : ZGEMM_L2x8_SUB4  - SUBI1 plus seven SUB1 (8 kernel invocations)
 *   L >= 2 : LOOP_START (LOAD + I1 + seven kernels), then L - 2 passes of
 *            LOOP (8 kernels each), then LOOP_END (seven kernels + E2)
 * After any full-block path the K & 7 tail runs one KERNEL2x8_SUB1 per step.
 *
 * The LOAD2x8_*, KERNEL2x8_* and SAVE2x8 macros are defined outside this view.
 * NOTE(review): each kernel invocation presumably consumes one k step and
 * advances AO/BO, and SAVE2x8 presumably advances CO - confirm in the macros.
 * dcbt lines are prefetch hints at AO+PRE / BO+PRE, interleaved by hand.
 */
80ZGEMM_L2x8_BEGIN:
81
82
83	mr		BO,	BBUFFER			// every tile rereads the expanded B from its start
84	srawi.		L,	K,	3		// L = K >> 3 blocks of 8
85	ble		ZGEMM_L2x8_SUB0
86	cmpwi		cr0,	L,	1
87	ble		ZGEMM_L2x8_SUB4			// exactly one block: use the non-pipelined form
88
89ZGEMM_L2x8_LOOP_START:
90
91	dcbt		AO,	PRE
92	dcbt		BO,	PRE
93	LOAD2x8_1
94	dcbt		AO,	PRE
95	KERNEL2x8_I1
96	dcbt		AO,	PRE
97	dcbt		BO,	PRE
98	KERNEL2x8_2
99	dcbt		AO,	PRE
100	KERNEL2x8_1
101	dcbt		AO,	PRE
102	dcbt		BO,	PRE
103	KERNEL2x8_2
104
105	dcbt		AO,	PRE
106	KERNEL2x8_1
107	dcbt		AO,	PRE
108	dcbt		BO,	PRE
109	KERNEL2x8_2
110	dcbt		AO,	PRE
111	KERNEL2x8_1
112	dcbt		AO,	PRE
113	dcbt		BO,	PRE
114	KERNEL2x8_2
115
116	addic.		L,	L,	-2		// account for the first block (here) and the last (LOOP_END)
117	ble		ZGEMM_L2x8_LOOP_END
118
119	.align 5
120
121ZGEMM_L2x8_LOOP:
122
123	dcbt		AO,	PRE
124	KERNEL2x8_1
125	dcbt		AO,	PRE
126	dcbt		BO,	PRE
127	KERNEL2x8_2
128	dcbt		AO,	PRE
129	KERNEL2x8_1
130	dcbt		AO,	PRE
131	dcbt		BO,	PRE
132	KERNEL2x8_2
133
134	dcbt		AO,	PRE
135	KERNEL2x8_1
136	dcbt		AO,	PRE
137	dcbt		BO,	PRE
138	KERNEL2x8_2
139	dcbt		AO,	PRE
140	KERNEL2x8_1
141	dcbt		AO,	PRE
142	dcbt		BO,	PRE
143	KERNEL2x8_2
144
145	addic.		L,	L,	-1
146	bgt		ZGEMM_L2x8_LOOP
147
148ZGEMM_L2x8_LOOP_END:
149
150	dcbt		AO,	PRE
151	KERNEL2x8_1
152	dcbt		AO,	PRE
153	dcbt		BO,	PRE
154	KERNEL2x8_2
155	dcbt		AO,	PRE
156	KERNEL2x8_1
157	dcbt		AO,	PRE
158	dcbt		BO,	PRE
159	KERNEL2x8_2
160
161	dcbt		AO,	PRE
162	KERNEL2x8_1
163	dcbt		AO,	PRE
164	KERNEL2x8_2
165	dcbt		AO,	PRE
166	KERNEL2x8_1
167	KERNEL2x8_E2					// final kernel of the block sequence (E2 variant)
168
169	b		ZGEMM_L2x8_SUB1
170
171ZGEMM_L2x8_SUB4:
172
173	dcbt		AO,	PRE
174	KERNEL2x8_SUBI1
175	dcbt		AO,	PRE
176	KERNEL2x8_SUB1
177	dcbt		AO,	PRE
178	KERNEL2x8_SUB1
179	dcbt		AO,	PRE
180	KERNEL2x8_SUB1
181
182	KERNEL2x8_SUB1
183	KERNEL2x8_SUB1
184	KERNEL2x8_SUB1
185	KERNEL2x8_SUB1
186
187	b		ZGEMM_L2x8_SUB1
188
// No full block of 8: SUBI1 runs once unconditionally, then the tail loop
// handles the remaining (K & 7) - 1 steps.
// NOTE(review): this path assumes K >= 1 - confirm callers never pass K == 0.
189ZGEMM_L2x8_SUB0:
190
191	andi.		L,	K,	7
192
193	KERNEL2x8_SUBI1
194
195	addic.		L,	L,	-1
196	ble		ZGEMM_L2x8_SAVE
197	b		ZGEMM_L2x8_SUB2
198
199ZGEMM_L2x8_SUB1:
200
201	andi.		L,	K,	7		// tail length after the full blocks
202	ble		ZGEMM_L2x8_SAVE
203
204ZGEMM_L2x8_SUB2:
205
206	KERNEL2x8_SUB1
207
208	addic.		L,	L,	-1
209	bgt		ZGEMM_L2x8_SUB2
210
211ZGEMM_L2x8_SAVE:
212
213	SAVE2x8
214
215	addic.		I,	I,	-1		// next 8-row tile
216	bgt		ZGEMM_L2x8_BEGIN
217
218ZGEMM_L2x8_END:
219
/*
 * 2-column x 4-row remainder tile; executed at most once per column pair.
 *
 * If M & 7 is zero the whole row remainder (4, 2 and 1) is skipped by
 * branching straight to ZGEMM_L2x1_END. Otherwise this tile runs only when
 * bit 2 of M is set. The K dispatch has the same shape as the 8-row tile:
 * L = K >> 3; L <= 0 -> SUB0, L == 1 -> SUB4, L >= 2 -> LOOP_START /
 * LOOP x (L - 2) / LOOP_END, followed by the K & 7 tail.
 * The LOAD2x4_*, KERNEL2x4_* and SAVE2x4 macros are defined outside this view.
 */
220ZGEMM_L2x4_BEGIN:
221
222	andi.		T2,	M,	7
223	ble		ZGEMM_L2x1_END			// M is a multiple of 8: no remainder rows at all
224
225	andi.		T1,	M,	4
226	ble		ZGEMM_L2x4_END			// bit 2 of M clear: no 4-row tile
227	mr		BO,	BBUFFER
228	srawi.		L,	K,	3
229	ble		ZGEMM_L2x4_SUB0
230	cmpwi		cr0,	L,	1
231	ble		ZGEMM_L2x4_SUB4
232
233ZGEMM_L2x4_LOOP_START:
234
235	LOAD2x4_1
236	KERNEL2x4_I1
237	KERNEL2x4_2
238	KERNEL2x4_1
239	KERNEL2x4_2
240
241	KERNEL2x4_1
242	KERNEL2x4_2
243	KERNEL2x4_1
244	KERNEL2x4_2
245
246	addic.		L,	L,	-2		// first and last blocks are handled outside the loop
247	ble		ZGEMM_L2x4_LOOP_END
248
249	.align 5
250
251ZGEMM_L2x4_LOOP:
252
253	KERNEL2x4_1
254	KERNEL2x4_2
255	KERNEL2x4_1
256	KERNEL2x4_2
257
258	KERNEL2x4_1
259	KERNEL2x4_2
260	KERNEL2x4_1
261	KERNEL2x4_2
262
263	addic.		L,	L,	-1
264	bgt		ZGEMM_L2x4_LOOP
265
266ZGEMM_L2x4_LOOP_END:
267
268	KERNEL2x4_1
269	KERNEL2x4_2
270	KERNEL2x4_1
271	KERNEL2x4_2
272
273	KERNEL2x4_1
274	KERNEL2x4_2
275	KERNEL2x4_1
276	KERNEL2x4_E2
277
278	b		ZGEMM_L2x4_SUB1
279
280ZGEMM_L2x4_SUB4:
281
282	KERNEL2x4_SUBI1
283	KERNEL2x4_SUB1
284	KERNEL2x4_SUB1
285	KERNEL2x4_SUB1
286
287	KERNEL2x4_SUB1
288	KERNEL2x4_SUB1
289	KERNEL2x4_SUB1
290	KERNEL2x4_SUB1
291
292	b		ZGEMM_L2x4_SUB1
293
// K < 8: SUBI1 runs once unconditionally, then the tail loop covers the rest.
294ZGEMM_L2x4_SUB0:
295
296	andi.		L,	K,	7
297
298	KERNEL2x4_SUBI1
299
300	addic.		L,	L,	-1
301	ble		ZGEMM_L2x4_SAVE
302	b		ZGEMM_L2x4_SUB2
303
304ZGEMM_L2x4_SUB1:
305
306	andi.		L,	K,	7		// tail length after the full blocks
307	ble		ZGEMM_L2x4_SAVE
308
309ZGEMM_L2x4_SUB2:
310
311	KERNEL2x4_SUB1
312
313	addic.		L,	L,	-1
314	bgt		ZGEMM_L2x4_SUB2
315
316ZGEMM_L2x4_SAVE:
317
318	SAVE2x4
319
320ZGEMM_L2x4_END:
321
/*
 * 2-column x 2-row remainder tile; runs once when bit 1 of M is set.
 *
 * K dispatch mirrors the larger tiles: L = K >> 3; L <= 0 -> SUB0,
 * L == 1 -> SUB4, L >= 2 -> LOOP_START / LOOP x (L - 2) / LOOP_END,
 * followed by the K & 7 tail (one KERNEL2x2_SUB1 per step).
 * The LOAD2x2_*, KERNEL2x2_* and SAVE2x2 macros are defined outside this view.
 */
322ZGEMM_L2x2_BEGIN:
323
324
325	andi.		T1,	M,	2
326	ble		ZGEMM_L2x2_END			// bit 1 of M clear: no 2-row tile
327	mr		BO,	BBUFFER
328	srawi.		L,	K,	3
329	ble		ZGEMM_L2x2_SUB0
330	cmpwi		cr0,	L,	1
331	ble		ZGEMM_L2x2_SUB4
332
333ZGEMM_L2x2_LOOP_START:
334
335	LOAD2x2_1
336	KERNEL2x2_I1
337	KERNEL2x2_2
338	KERNEL2x2_1
339	KERNEL2x2_2
340
341	KERNEL2x2_1
342	KERNEL2x2_2
343	KERNEL2x2_1
344	KERNEL2x2_2
345
346	addic.		L,	L,	-2		// first and last blocks are handled outside the loop
347	ble		ZGEMM_L2x2_LOOP_END
348
349	.align 5
350
351ZGEMM_L2x2_LOOP:
352
353	KERNEL2x2_1
354	KERNEL2x2_2
355	KERNEL2x2_1
356	KERNEL2x2_2
357
358	KERNEL2x2_1
359	KERNEL2x2_2
360	KERNEL2x2_1
361	KERNEL2x2_2
362
363	addic.		L,	L,	-1
364	bgt		ZGEMM_L2x2_LOOP
365
366ZGEMM_L2x2_LOOP_END:
367
368	KERNEL2x2_1
369	KERNEL2x2_2
370	KERNEL2x2_1
371	KERNEL2x2_2
372
373	KERNEL2x2_1
374	KERNEL2x2_2
375	KERNEL2x2_1
376	KERNEL2x2_E2
377
378	b		ZGEMM_L2x2_SUB1
379
380ZGEMM_L2x2_SUB4:
381
382	KERNEL2x2_SUBI1
383	KERNEL2x2_SUB1
384	KERNEL2x2_SUB1
385	KERNEL2x2_SUB1
386
387	KERNEL2x2_SUB1
388	KERNEL2x2_SUB1
389	KERNEL2x2_SUB1
390	KERNEL2x2_SUB1
391
392	b		ZGEMM_L2x2_SUB1
393
// K < 8: SUBI1 runs once unconditionally, then the tail loop covers the rest.
394ZGEMM_L2x2_SUB0:
395
396	andi.		L,	K,	7
397
398	KERNEL2x2_SUBI1
399
400	addic.		L,	L,	-1
401	ble		ZGEMM_L2x2_SAVE
402	b		ZGEMM_L2x2_SUB2
403
404ZGEMM_L2x2_SUB1:
405
406	andi.		L,	K,	7		// tail length after the full blocks
407	ble		ZGEMM_L2x2_SAVE
408
409ZGEMM_L2x2_SUB2:
410
411	KERNEL2x2_SUB1
412
413	addic.		L,	L,	-1
414	bgt		ZGEMM_L2x2_SUB2
415
416ZGEMM_L2x2_SAVE:
417
418	SAVE2x2
419
420ZGEMM_L2x2_END:
421
/*
 * 2-column x 1-row remainder tile; runs once when bit 0 of M is set.
 *
 * K dispatch mirrors the larger tiles: L = K >> 3; L <= 0 -> SUB0,
 * L == 1 -> SUB4, L >= 2 -> LOOP_START / LOOP x (L - 2) / LOOP_END,
 * followed by the K & 7 tail (one KERNEL2x1_SUB1 per step).
 * The LOAD2x1_*, KERNEL2x1_* and SAVE2x1 macros are defined outside this view.
 */
422ZGEMM_L2x1_BEGIN:
423
424
425	andi.		T1,	M,	1
426	ble		ZGEMM_L2x1_END			// M even: no single-row tile
427	mr		BO,	BBUFFER
428	srawi.		L,	K,	3
429	ble		ZGEMM_L2x1_SUB0
430	cmpwi		cr0,	L,	1
431	ble		ZGEMM_L2x1_SUB4
432
433ZGEMM_L2x1_LOOP_START:
434
435	LOAD2x1_1
436	KERNEL2x1_I1
437	KERNEL2x1_2
438	KERNEL2x1_1
439	KERNEL2x1_2
440
441	KERNEL2x1_1
442	KERNEL2x1_2
443	KERNEL2x1_1
444	KERNEL2x1_2
445
446	addic.		L,	L,	-2		// first and last blocks are handled outside the loop
447	ble		ZGEMM_L2x1_LOOP_END
448
449	.align 5
450
451ZGEMM_L2x1_LOOP:
452
453	KERNEL2x1_1
454	KERNEL2x1_2
455	KERNEL2x1_1
456	KERNEL2x1_2
457
458	KERNEL2x1_1
459	KERNEL2x1_2
460	KERNEL2x1_1
461	KERNEL2x1_2
462
463	addic.		L,	L,	-1
464	bgt		ZGEMM_L2x1_LOOP
465
466ZGEMM_L2x1_LOOP_END:
467
468	KERNEL2x1_1
469	KERNEL2x1_2
470	KERNEL2x1_1
471	KERNEL2x1_2
472
473	KERNEL2x1_1
474	KERNEL2x1_2
475	KERNEL2x1_1
476	KERNEL2x1_E2
477
478	b		ZGEMM_L2x1_SUB1
479
480ZGEMM_L2x1_SUB4:
481
482	KERNEL2x1_SUBI1
483	KERNEL2x1_SUB1
484	KERNEL2x1_SUB1
485	KERNEL2x1_SUB1
486
487	KERNEL2x1_SUB1
488	KERNEL2x1_SUB1
489	KERNEL2x1_SUB1
490	KERNEL2x1_SUB1
491
492	b		ZGEMM_L2x1_SUB1
493
// K < 8: SUBI1 runs once unconditionally, then the tail loop covers the rest.
494ZGEMM_L2x1_SUB0:
495
496	andi.		L,	K,	7
497
498	KERNEL2x1_SUBI1
499
500	addic.		L,	L,	-1
501	ble		ZGEMM_L2x1_SAVE
502	b		ZGEMM_L2x1_SUB2
503
504ZGEMM_L2x1_SUB1:
505
506	andi.		L,	K,	7		// tail length after the full blocks
507	ble		ZGEMM_L2x1_SAVE
508
509ZGEMM_L2x1_SUB2:
510
511	KERNEL2x1_SUB1
512
513	addic.		L,	L,	-1
514	bgt		ZGEMM_L2x1_SUB2
515
516ZGEMM_L2x1_SAVE:
517
518	SAVE2x1
519
/*
 * End of one column-pair pass: step B to the next pair, loop on J, then
 * decide whether a final odd column remains.
 */
520ZGEMM_L2x1_END:
521
522	slwi		T1,	K,	5		// T1 = K * 32 bytes
523	add		B,	B,	T1		// advance B past the panel just processed
524
525	addic.		J,	J,	-1
526	bgt		ZGEMM_L2_BEGIN			// more column pairs to do
527
528	andi.		T2,	N,	1
529	ble		L999				// N even: finished (L999 is defined outside this view)
530
// Also the target of the initial "N >> 1 <= 0" skip; control always continues
// into the single-column path, which tests N & 1 itself.
531ZGEMM_L2_END:
532
533	b		ZGEMM_L1_BEGIN
534
// Branch island to L999; nothing in the visible part of the file jumps here.
535L999_H1:
536
537	b		L999
538
/*
 * Single-column path (final odd column of B/C).
 *
 * First expands the K complex values of the B column into BBUFFER: each
 * 16-byte (real, imag) pair becomes 32 bytes, with the real part splatted
 * into one 16-byte vector and the imaginary part into the next (lxvdsx loads
 * one doubleword into both halves of the register). Then, if N is odd, sets
 * up the A/C cursors and the 8-row tile count.
 *
 * o0/o8/o16 are offset registers defined outside this view.
 * NOTE(review): they presumably hold 0, 8 and 16 - confirm.
 *
 * The copy loop is do-while shaped: T1 starts at K and is decremented once
 * per element, so the back-branch must be "bgt" to run exactly K times.
 * With "bge" it ran K + 1 times, reading 16 bytes past the end of the column
 * and writing 32 bytes of BBUFFER that no kernel consumes.
 */
539ZGEMM_L1_BEGIN:
540
541	mr		BO,	B			// BO  = read cursor over the B column
542	mr		BBO,	BBUFFER			// BBO = write cursor into the expanded copy
543	slwi		T1,	K,	0		// T1 = K (element count)
544
545ZGEMM_L1_COPYB:
546	dcbtst		BBO,	PRE			// store-prefetch hint ahead of the write cursor
547
548	lxvdsx		vs4,	o0,	BO              // b0_r
549	lxvdsx		vs5,	o8,	BO              // b0_i
550	addi		BO,	BO,	16
551	stxvd2x		vs4,	o0,	BBO
552	stxvd2x		vs5,	o16,	BBO
553	addic.		T1,	T1,	-1		// sets CR0 for the back-branch below
554	addi		BBO,	BBO,	32		// addi leaves CR0 untouched
555
556	bgt		ZGEMM_L1_COPYB			// loop while elements remain (K passes in total)
557
558
559	andi.		T1,	N,	1
560	ble		ZGEMM_L1_END			// N even: nothing to compute on this path
561	mr		CO,	C
562	mr		AO,	A
563	srawi.		I,	M,	3		// I = M >> 3 tiles of 8 rows
564	ble		ZGEMM_L1x8_END
565
/*
 * 1-column x 8-row tile loop; runs I = M >> 3 times.
 *
 * Dispatch on L = K >> 3:
 *   L <= 0 : ZGEMM_L1x8_SUB0  - KERNEL1x8_SUBI1 once, then the K & 7 tail
 *   L == 1 : ZGEMM_L1x8_SUB4  - SUBI1 plus seven SUB1 (8 kernel invocations)
 *   L >= 2 : LOOP_START (LOAD + I1 + seven kernels), then L - 2 passes of
 *            LOOP (8 kernels each), then LOOP_END (seven kernels + E2)
 * After any full-block path the K & 7 tail runs one KERNEL1x8_SUB1 per step.
 *
 * The LOAD1x8_*, KERNEL1x8_* and SAVE1x8 macros are defined outside this view.
 * Only A is prefetched here (dcbt AO, PRE); there are no dcbt hints on BO.
 */
566ZGEMM_L1x8_BEGIN:
567
568
569	mr		BO,	BBUFFER			// every tile rereads the expanded B from its start
570	srawi.		L,	K,	3		// L = K >> 3 blocks of 8
571	ble		ZGEMM_L1x8_SUB0
572	cmpwi		cr0,	L,	1
573	ble		ZGEMM_L1x8_SUB4			// exactly one block: use the non-pipelined form
574
575ZGEMM_L1x8_LOOP_START:
576
577	dcbt		AO,	PRE
578	LOAD1x8_1
579	dcbt		AO,	PRE
580	KERNEL1x8_I1
581	dcbt		AO,	PRE
582	KERNEL1x8_2
583	dcbt		AO,	PRE
584	KERNEL1x8_1
585	dcbt		AO,	PRE
586	KERNEL1x8_2
587
588	dcbt		AO,	PRE
589	KERNEL1x8_1
590	dcbt		AO,	PRE
591	KERNEL1x8_2
592	dcbt		AO,	PRE
593	KERNEL1x8_1
594	dcbt		AO,	PRE
595	KERNEL1x8_2
596
597	addic.		L,	L,	-2		// account for the first block (here) and the last (LOOP_END)
598	ble		ZGEMM_L1x8_LOOP_END
599
600	.align 5
601
602ZGEMM_L1x8_LOOP:
603
604	dcbt		AO,	PRE
605	KERNEL1x8_1
606	dcbt		AO,	PRE
607	KERNEL1x8_2
608	dcbt		AO,	PRE
609	KERNEL1x8_1
610	dcbt		AO,	PRE
611	KERNEL1x8_2
612
613	dcbt		AO,	PRE
614	KERNEL1x8_1
615	dcbt		AO,	PRE
616	KERNEL1x8_2
617	dcbt		AO,	PRE
618	KERNEL1x8_1
619	dcbt		AO,	PRE
620	KERNEL1x8_2
621
622	addic.		L,	L,	-1
623	bgt		ZGEMM_L1x8_LOOP
624
625ZGEMM_L1x8_LOOP_END:
626
627	dcbt		AO,	PRE
628	KERNEL1x8_1
629	dcbt		AO,	PRE
630	KERNEL1x8_2
631	dcbt		AO,	PRE
632	KERNEL1x8_1
633	dcbt		AO,	PRE
634	KERNEL1x8_2
635
636	dcbt		AO,	PRE
637	KERNEL1x8_1
638	dcbt		AO,	PRE
639	KERNEL1x8_2
640	dcbt		AO,	PRE
641	KERNEL1x8_1
642	KERNEL1x8_E2					// final kernel of the block sequence (E2 variant)
643
644	b		ZGEMM_L1x8_SUB1
645
646ZGEMM_L1x8_SUB4:
647
648	dcbt		AO,	PRE
649	KERNEL1x8_SUBI1
650	dcbt		AO,	PRE
651	KERNEL1x8_SUB1
652	dcbt		AO,	PRE
653	KERNEL1x8_SUB1
654	dcbt		AO,	PRE
655	KERNEL1x8_SUB1
656
657	KERNEL1x8_SUB1
658	KERNEL1x8_SUB1
659	KERNEL1x8_SUB1
660	KERNEL1x8_SUB1
661
662	b		ZGEMM_L1x8_SUB1
663
// No full block of 8: SUBI1 runs once unconditionally, then the tail loop
// handles the remaining (K & 7) - 1 steps.
// NOTE(review): this path assumes K >= 1 - confirm callers never pass K == 0.
664ZGEMM_L1x8_SUB0:
665
666	andi.		L,	K,	7
667
668	KERNEL1x8_SUBI1
669
670	addic.		L,	L,	-1
671	ble		ZGEMM_L1x8_SAVE
672	b		ZGEMM_L1x8_SUB2
673
674ZGEMM_L1x8_SUB1:
675
676	andi.		L,	K,	7		// tail length after the full blocks
677	ble		ZGEMM_L1x8_SAVE
678
679ZGEMM_L1x8_SUB2:
680
681	KERNEL1x8_SUB1
682
683	addic.		L,	L,	-1
684	bgt		ZGEMM_L1x8_SUB2
685
686ZGEMM_L1x8_SAVE:
687
688	SAVE1x8
689
690	addic.		I,	I,	-1		// next 8-row tile
691	bgt		ZGEMM_L1x8_BEGIN
692
693ZGEMM_L1x8_END:
694
/*
 * 1-column x 4-row remainder tile; executed at most once.
 *
 * If M & 7 is zero the whole row remainder (4, 2 and 1) is skipped by
 * branching straight to ZGEMM_L1x1_END. Otherwise this tile runs only when
 * bit 2 of M is set. The K dispatch has the same shape as the 8-row tile:
 * L = K >> 3; L <= 0 -> SUB0, L == 1 -> SUB4, L >= 2 -> LOOP_START /
 * LOOP x (L - 2) / LOOP_END, followed by the K & 7 tail.
 * The LOAD1x4_*, KERNEL1x4_* and SAVE1x4 macros are defined outside this view.
 */
695ZGEMM_L1x4_BEGIN:
696
697	andi.		T2,	M,	7
698	ble		ZGEMM_L1x1_END			// M is a multiple of 8: no remainder rows at all
699
700	andi.		T1,	M,	4
701	ble		ZGEMM_L1x4_END			// bit 2 of M clear: no 4-row tile
702	mr		BO,	BBUFFER
703	srawi.		L,	K,	3
704	ble		ZGEMM_L1x4_SUB0
705	cmpwi		cr0,	L,	1
706	ble		ZGEMM_L1x4_SUB4
707
708ZGEMM_L1x4_LOOP_START:
709
710	LOAD1x4_1
711	KERNEL1x4_I1
712	KERNEL1x4_2
713	KERNEL1x4_1
714	KERNEL1x4_2
715
716	KERNEL1x4_1
717	KERNEL1x4_2
718	KERNEL1x4_1
719	KERNEL1x4_2
720
721	addic.		L,	L,	-2		// first and last blocks are handled outside the loop
722	ble		ZGEMM_L1x4_LOOP_END
723
724	.align 5
725
726ZGEMM_L1x4_LOOP:
727
728	KERNEL1x4_1
729	KERNEL1x4_2
730	KERNEL1x4_1
731	KERNEL1x4_2
732
733	KERNEL1x4_1
734	KERNEL1x4_2
735	KERNEL1x4_1
736	KERNEL1x4_2
737
738	addic.		L,	L,	-1
739	bgt		ZGEMM_L1x4_LOOP
740
741ZGEMM_L1x4_LOOP_END:
742
743	KERNEL1x4_1
744	KERNEL1x4_2
745	KERNEL1x4_1
746	KERNEL1x4_2
747
748	KERNEL1x4_1
749	KERNEL1x4_2
750	KERNEL1x4_1
751	KERNEL1x4_E2
752
753	b		ZGEMM_L1x4_SUB1
754
755ZGEMM_L1x4_SUB4:
756
757	KERNEL1x4_SUBI1
758	KERNEL1x4_SUB1
759	KERNEL1x4_SUB1
760	KERNEL1x4_SUB1
761
762	KERNEL1x4_SUB1
763	KERNEL1x4_SUB1
764	KERNEL1x4_SUB1
765	KERNEL1x4_SUB1
766
767	b		ZGEMM_L1x4_SUB1
768
// K < 8: SUBI1 runs once unconditionally, then the tail loop covers the rest.
769ZGEMM_L1x4_SUB0:
770
771	andi.		L,	K,	7
772
773	KERNEL1x4_SUBI1
774
775	addic.		L,	L,	-1
776	ble		ZGEMM_L1x4_SAVE
777	b		ZGEMM_L1x4_SUB2
778
779ZGEMM_L1x4_SUB1:
780
781	andi.		L,	K,	7		// tail length after the full blocks
782	ble		ZGEMM_L1x4_SAVE
783
784ZGEMM_L1x4_SUB2:
785
786	KERNEL1x4_SUB1
787
788	addic.		L,	L,	-1
789	bgt		ZGEMM_L1x4_SUB2
790
791ZGEMM_L1x4_SAVE:
792
793	SAVE1x4
794
795ZGEMM_L1x4_END:
796
/*
 * 1-column x 2-row remainder tile; runs once when bit 1 of M is set.
 *
 * K dispatch mirrors the larger tiles: L = K >> 3; L <= 0 -> SUB0,
 * L == 1 -> SUB4, L >= 2 -> LOOP_START / LOOP x (L - 2) / LOOP_END,
 * followed by the K & 7 tail (one KERNEL1x2_SUB1 per step).
 * The LOAD1x2_*, KERNEL1x2_* and SAVE1x2 macros are defined outside this view.
 */
797ZGEMM_L1x2_BEGIN:
798
799
800	andi.		T1,	M,	2
801	ble		ZGEMM_L1x2_END			// bit 1 of M clear: no 2-row tile
802	mr		BO,	BBUFFER
803	srawi.		L,	K,	3
804	ble		ZGEMM_L1x2_SUB0
805	cmpwi		cr0,	L,	1
806	ble		ZGEMM_L1x2_SUB4
807
808ZGEMM_L1x2_LOOP_START:
809
810	LOAD1x2_1
811	KERNEL1x2_I1
812	KERNEL1x2_2
813	KERNEL1x2_1
814	KERNEL1x2_2
815
816	KERNEL1x2_1
817	KERNEL1x2_2
818	KERNEL1x2_1
819	KERNEL1x2_2
820
821	addic.		L,	L,	-2		// first and last blocks are handled outside the loop
822	ble		ZGEMM_L1x2_LOOP_END
823
824	.align 5
825
826ZGEMM_L1x2_LOOP:
827
828	KERNEL1x2_1
829	KERNEL1x2_2
830	KERNEL1x2_1
831	KERNEL1x2_2
832
833	KERNEL1x2_1
834	KERNEL1x2_2
835	KERNEL1x2_1
836	KERNEL1x2_2
837
838	addic.		L,	L,	-1
839	bgt		ZGEMM_L1x2_LOOP
840
841ZGEMM_L1x2_LOOP_END:
842
843	KERNEL1x2_1
844	KERNEL1x2_2
845	KERNEL1x2_1
846	KERNEL1x2_2
847
848	KERNEL1x2_1
849	KERNEL1x2_2
850	KERNEL1x2_1
851	KERNEL1x2_E2
852
853	b		ZGEMM_L1x2_SUB1
854
855ZGEMM_L1x2_SUB4:
856
857	KERNEL1x2_SUBI1
858	KERNEL1x2_SUB1
859	KERNEL1x2_SUB1
860	KERNEL1x2_SUB1
861
862	KERNEL1x2_SUB1
863	KERNEL1x2_SUB1
864	KERNEL1x2_SUB1
865	KERNEL1x2_SUB1
866
867	b		ZGEMM_L1x2_SUB1
868
// K < 8: SUBI1 runs once unconditionally, then the tail loop covers the rest.
869ZGEMM_L1x2_SUB0:
870
871	andi.		L,	K,	7
872
873	KERNEL1x2_SUBI1
874
875	addic.		L,	L,	-1
876	ble		ZGEMM_L1x2_SAVE
877	b		ZGEMM_L1x2_SUB2
878
879ZGEMM_L1x2_SUB1:
880
881	andi.		L,	K,	7		// tail length after the full blocks
882	ble		ZGEMM_L1x2_SAVE
883
884ZGEMM_L1x2_SUB2:
885
886	KERNEL1x2_SUB1
887
888	addic.		L,	L,	-1
889	bgt		ZGEMM_L1x2_SUB2
890
891ZGEMM_L1x2_SAVE:
892
893	SAVE1x2
894
895ZGEMM_L1x2_END:
896
/*
 * 1-column x 1-row remainder tile; runs once when bit 0 of M is set.
 *
 * K dispatch mirrors the larger tiles: L = K >> 3; L <= 0 -> SUB0,
 * L == 1 -> SUB4, L >= 2 -> LOOP_START / LOOP x (L - 2) / LOOP_END,
 * followed by the K & 7 tail (one KERNEL1x1_SUB1 per step).
 * The LOAD1x1_*, KERNEL1x1_* and SAVE1x1 macros are defined outside this view.
 * Falls through ZGEMM_L1x1_END into ZGEMM_L1_END, the exit label of the
 * single-column path.
 */
897ZGEMM_L1x1_BEGIN:
898
899
900	andi.		T1,	M,	1
901	ble		ZGEMM_L1x1_END			// M even: no single-row tile
902	mr		BO,	BBUFFER
903	srawi.		L,	K,	3
904	ble		ZGEMM_L1x1_SUB0
905	cmpwi		cr0,	L,	1
906	ble		ZGEMM_L1x1_SUB4
907
908ZGEMM_L1x1_LOOP_START:
909
910	LOAD1x1_1
911	KERNEL1x1_I1
912	KERNEL1x1_2
913	KERNEL1x1_1
914	KERNEL1x1_2
915
916	KERNEL1x1_1
917	KERNEL1x1_2
918	KERNEL1x1_1
919	KERNEL1x1_2
920
921	addic.		L,	L,	-2		// first and last blocks are handled outside the loop
922	ble		ZGEMM_L1x1_LOOP_END
923
924	.align 5
925
926ZGEMM_L1x1_LOOP:
927
928	KERNEL1x1_1
929	KERNEL1x1_2
930	KERNEL1x1_1
931	KERNEL1x1_2
932
933	KERNEL1x1_1
934	KERNEL1x1_2
935	KERNEL1x1_1
936	KERNEL1x1_2
937
938	addic.		L,	L,	-1
939	bgt		ZGEMM_L1x1_LOOP
940
941ZGEMM_L1x1_LOOP_END:
942
943	KERNEL1x1_1
944	KERNEL1x1_2
945	KERNEL1x1_1
946	KERNEL1x1_2
947
948	KERNEL1x1_1
949	KERNEL1x1_2
950	KERNEL1x1_1
951	KERNEL1x1_E2
952
953	b		ZGEMM_L1x1_SUB1
954
955ZGEMM_L1x1_SUB4:
956
957	KERNEL1x1_SUBI1
958	KERNEL1x1_SUB1
959	KERNEL1x1_SUB1
960	KERNEL1x1_SUB1
961
962	KERNEL1x1_SUB1
963	KERNEL1x1_SUB1
964	KERNEL1x1_SUB1
965	KERNEL1x1_SUB1
966
967	b		ZGEMM_L1x1_SUB1
968
// K < 8: SUBI1 runs once unconditionally, then the tail loop covers the rest.
969ZGEMM_L1x1_SUB0:
970
971	andi.		L,	K,	7
972
973	KERNEL1x1_SUBI1
974
975	addic.		L,	L,	-1
976	ble		ZGEMM_L1x1_SAVE
977	b		ZGEMM_L1x1_SUB2
978
979ZGEMM_L1x1_SUB1:
980
981	andi.		L,	K,	7		// tail length after the full blocks
982	ble		ZGEMM_L1x1_SAVE
983
984ZGEMM_L1x1_SUB2:
985
986	KERNEL1x1_SUB1
987
988	addic.		L,	L,	-1
989	bgt		ZGEMM_L1x1_SUB2
990
991ZGEMM_L1x1_SAVE:
992
993	SAVE1x1
994
995ZGEMM_L1x1_END:
996
997ZGEMM_L1_END:
998