/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/

//	// prologue
//	stmdb	sp!, {r4 - r10, fp, lr} // save GP registers
//	add		fp, sp, #36 // fp to old sp position
//	fstmfdd	sp!, {d8-d15} // save FP registers
#define PROLOGUE \
	stmdb	sp!, {r4 - r10, fp, lr}; \
	add		fp, sp, #36; \
	fstmfdd	sp!, {d8-d15};
//	// epilogue
//	fldmfdd	sp!, {d8-d15} // load FP registers
//	ldmia	sp!, {r4 - r10, fp, pc} // load GP registers and return
#define EPILOGUE \
	fldmfdd	sp!, {d8-d15}; \
	ldmia	sp!, {r4 - r10, fp, pc};



#if defined(OS_LINUX)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif



// subroutine
//
// input arguments:
// r4   <- k
// r5   <- A
// r6   <- B
//
// output arguments:
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_gemm_add_nt_4x4_lib4, %function
inner_kernel_gemm_add_nt_4x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_gemm_add_nt_4x4_lib4:
#endif
#endif

	// early return
	cmp		r4, #0
	ble		2f // return

	// prefetch
	pld		[r5, #0]
	pld		[r6, #0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #32]
	pld		[r6, #32]
#endif
	pld		[r5, #64]
	pld		[r6, #64]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #96]
	pld		[r6, #96]
#endif
	pld		[r5, #64]
#else // cortex a15
	// preload
	vld1.64		{d0, d1}, [r5:128]! // A
	vld1.64		{d4, d5}, [r6:128]! // B
#endif

	cmp		r4, #4
	ble		0f // consider clean up loop

	// main loop
1:

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1}, [r6:128]! // B
	vld1.64		{d16, d17}, [r5:128]! // A

	vld1.64		{d2, d3}, [r6:128]! // B
	vld1.64		{d18, d19}, [r5:128]! // A

	vld1.64		{d4, d5}, [r6:128]! // B
	vld1.64		{d20, d21}, [r5:128]! // A

	vld1.64		{d6, d7}, [r6:128]! // B
	vld1.64		{d22, d23}, [r5:128]! // A

	// prefetch

	// unroll 0
	vmla.f32	q4, q8, d0[0]
	pld		[r6, #64]
	vmla.f32	q5, q8, d0[1]
	pld		[r5, #64]
	vmla.f32	q6, q8, d1[0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r6, #96]
#endif
	vmla.f32	q7, q8, d1[1]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #96]
#endif

	// unroll 1
	vmla.f32	q4, q9, d2[0]
	vmla.f32	q5, q9, d2[1]
	vmla.f32	q6, q9, d3[0]
	vmla.f32	q7, q9, d3[1]

	// unroll 2
	vmla.f32	q4, q10, d4[0]
	vmla.f32	q5, q10, d4[1]
	vmla.f32	q6, q10, d5[0]
	vmla.f32	q7, q10, d5[1]

	// unroll 3
	vmla.f32	q4, q11, d6[0]
	vmla.f32	q5, q11, d6[1]
	vmla.f32	q6, q11, d7[0]
	vmla.f32	q7, q11, d7[1]

	sub		r4, r4, #4

#else // cortex a15

	// prefetch
	pld		[r5, #64]
	pld		[r6, #64]

	// unroll 0
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d4[1]
	vld1.64		{d6, d7}, [r6:128]! // B
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	// unroll 1
	vmla.f32	q4, q1, d6[0]
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q5, q1, d6[1]
	vld1.64		{d4, d5}, [r6:128]! // B
	vmla.f32	q6, q1, d7[0]
	vmla.f32	q7, q1, d7[1]

	// unroll 2
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d4[1]
	vld1.64		{d6, d7}, [r6:128]! // B
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	// unroll 3
	vmla.f32	q4, q1, d6[0]
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q5, q1, d6[1]
	vld1.64		{d4, d5}, [r6:128]! // B
	vmla.f32	q6, q1, d7[0]
	vmla.f32	q7, q1, d7[1]

	sub		r4, r4, #4

#endif

	cmp		r4, #4
	bgt		1b

0:

	cmp		r4, #3
	ble		4f

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1}, [r6:128]! // B
	vld1.64		{d16, d17}, [r5:128]! // A

	vld1.64		{d2, d3}, [r6:128]! // B
	vld1.64		{d18, d19}, [r5:128]! // A

	vld1.64		{d4, d5}, [r6:128]! // B
	vld1.64		{d20, d21}, [r5:128]! // A

	vld1.64		{d6, d7}, [r6:128]! // B
	vld1.64		{d22, d23}, [r5:128]! // A

	// prefetch

	// unroll 0
	vmla.f32	q4, q8, d0[0]
//	pld		[r5, #64]
	vmla.f32	q5, q8, d0[1]
//	pld		[r6, #64]
	vmla.f32	q6, q8, d1[0]
	vmla.f32	q7, q8, d1[1]

	// unroll 1
	vmla.f32	q4, q9, d2[0]
	vmla.f32	q5, q9, d2[1]
	vmla.f32	q6, q9, d3[0]
	vmla.f32	q7, q9, d3[1]

	// unroll 2
	vmla.f32	q4, q10, d4[0]
	vmla.f32	q5, q10, d4[1]
	vmla.f32	q6, q10, d5[0]
	vmla.f32	q7, q10, d5[1]

	// unroll 3
	vmla.f32	q4, q11, d6[0]
	vmla.f32	q5, q11, d6[1]
	vmla.f32	q6, q11, d7[0]
	vmla.f32	q7, q11, d7[1]

	sub		r4, r4, #4

#else // cortex a15

	// unroll 0
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d4[1]
	vld1.64		{d6, d7}, [r6:128]! // B
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	// unroll 1
	vmla.f32	q4, q1, d6[0]
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q5, q1, d6[1]
	vld1.64		{d4, d5}, [r6:128]! // B
	vmla.f32	q6, q1, d7[0]
	vmla.f32	q7, q1, d7[1]

	// unroll 2
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d4[1]
	vld1.64		{d6, d7}, [r6:128]! // B
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	// unroll 3
	vmla.f32	q4, q1, d6[0]
//	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q5, q1, d6[1]
//	vld1.64		{d4, d5}, [r6:128]! // B
	vmla.f32	q6, q1, d7[0]
	vmla.f32	q7, q1, d7[1]

	sub		r4, r4, #4

#endif

	b		2f // return

4: // consider clean1-up loop

	cmp		r4, #0
	ble		2f // return

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

#else // cortex a15
	sub		r5, r5, #16
	sub		r6, r6, #16
#endif

3: // clean1-up loop

	// unroll 0
	vld1.64		{d0, d1}, [r5:128]! // A
	vld1.64		{d4, d5}, [r6:128]! // B

	vmla.f32	q4, q0, d4[0]
	vmla.f32	q5, q0, d4[1]
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	sub		r4, r4, #1
	cmp		r4, #0
	bgt		3b

2: // return


#if MACRO_LEVEL>=2
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
#endif
#endif





// subroutine
//
// input arguments:
// r4   <- k
// r5   <- A
// r6   <- B
// r7   <- 4*sdb*sizeof(float)
//
// output arguments:
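//
// A hedged C reference (illustrative name, not BLASFEO API): B is stored in
// 4-row panels with panel stride sdb, so element B(l,j) lives at
// B[(l/4)*4*sdb + l%4 + 4*j]; r7 carries 4*sdb*sizeof(float) to step panels.
//
//	static void ref_kernel_gemm_add_nn_4x4(int k, const float *A, const float *B, int sdb, float D[16])
//		{
//		int i, j, l;
//		for(l=0; l<k; l++)
//			for(j=0; j<4; j++)
//				for(i=0; i<4; i++)
//					D[i+4*j] += A[i+4*l] * B[(l/4)*4*sdb + l%4 + 4*j];
//		}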

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_gemm_add_nn_4x4_lib4, %function
inner_kernel_gemm_add_nn_4x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_gemm_add_nn_4x4_lib4:
#endif
#endif

	// early return
	cmp		r4, #0
	ble		2f // return

	// prefetch
	pld		[r5, #0]
	pld		[r6, #0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	add		r8, r7, r7
	pld		[r5, #32]
	add		r9, r7, #32
	pld		[r6, #32]
#endif
	pld		[r5, #64]
	pld		[r6, r7]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #96]
	pld		[r6, r9]
	add		r9, r9, r7
#endif
	pld		[r5, #64]
#else // cortex a15
	// preload
	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		d4, [r6, #0]   // B[0,1]
	vldr		d5, [r6, #16]  // B[4,5]
	vldr		d6, [r6, #32]  // B[8,9]
	vldr		d7, [r6, #48]  // B[12,13]
#endif

	cmp		r4, #4
	ble		0f // consider clean up loop

	// main loop
1:

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	// prefetch

	vld1.64		{d0, d1, d2, d3}, [r6:128]! // B
	vld1.64		{d16, d17, d18, d19}, [r5:128]! // A

	vld1.64		{d4, d5, d6, d7}, [r6:128]! // B
	vld1.64		{d20, d21, d22, d23}, [r5:128]! // A

	sub		r6, r6, #64

	// unroll 0
	vmla.f32	q4, q8, d0[0]
	pld		[r6, r8]
	vmla.f32	q5, q8, d2[0]
	pld		[r5, #64]
	vmla.f32	q6, q8, d4[0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r6, r9]
#endif
	vmla.f32	q7, q8, d6[0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #96]
#endif

	// unroll 1
	vmla.f32	q4, q9, d0[1]
	vmla.f32	q5, q9, d2[1]
	vmla.f32	q6, q9, d4[1]
	vmla.f32	q7, q9, d6[1]

	// unroll 2
	vmla.f32	q4, q10, d1[0]
	vmla.f32	q5, q10, d3[0]
	vmla.f32	q6, q10, d5[0]
	vmla.f32	q7, q10, d7[0]

	// unroll 3
	vmla.f32	q4, q11, d1[1]
	vmla.f32	q5, q11, d3[1]
	vmla.f32	q6, q11, d5[1]
	vmla.f32	q7, q11, d7[1]

	add		r6, r6, r7
	sub		r4, r4, #4

#else // cortex a15

	// prefetch
	pld		[r5, #64]
	pld		[r6, r7]

	// unroll 0
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d5[0]
	vmla.f32	q6, q0, d6[0]
	vmla.f32	q7, q0, d7[0]

	// unroll 1
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q4, q1, d4[1]
	vldr		d4, [r6, #8]  // B[2,3]
	vmla.f32	q5, q1, d5[1]
	vldr		d5, [r6, #24] // B[6,7]
	vmla.f32	q6, q1, d6[1]
	vldr		d6, [r6, #40] // B[10,11]
	vmla.f32	q7, q1, d7[1]
	vldr		d7, [r6, #56] // B[14,15]

	// unroll 2
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d5[0]
	add		r6, r6, r7
	vmla.f32	q6, q0, d6[0]
	vmla.f32	q7, q0, d7[0]

	// unroll 3
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q4, q1, d4[1]
	vldr		d4, [r6, #0]   // B[0,1]
	vmla.f32	q5, q1, d5[1]
	vldr		d5, [r6, #16]  // B[4,5]
	vmla.f32	q6, q1, d6[1]
	vldr		d6, [r6, #32]  // B[8,9]
	vmla.f32	q7, q1, d7[1]
	vldr		d7, [r6, #48]  // B[12,13]

	sub		r4, r4, #4

#endif

	cmp		r4, #4
	bgt		1b

0:

	cmp		r4, #3
	ble		4f

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1, d2, d3}, [r6:128]! // B
	vld1.64		{d16, d17, d18, d19}, [r5:128]! // A

	vld1.64		{d4, d5, d6, d7}, [r6:128]! // B
	vld1.64		{d20, d21, d22, d23}, [r5:128]! // A

	// prefetch

	// unroll 0
	vmla.f32	q4, q8, d0[0]
//	pld		[r5, #64]
	vmla.f32	q5, q8, d2[0]
//	pld		[r6, #64]
	vmla.f32	q6, q8, d4[0]
	vmla.f32	q7, q8, d6[0]

	// unroll 1
	vmla.f32	q4, q9, d0[1]
	vmla.f32	q5, q9, d2[1]
	vmla.f32	q6, q9, d4[1]
	vmla.f32	q7, q9, d6[1]

	// unroll 2
	vmla.f32	q4, q10, d1[0]
	vmla.f32	q5, q10, d3[0]
	vmla.f32	q6, q10, d5[0]
	vmla.f32	q7, q10, d7[0]

	// unroll 3
	vmla.f32	q4, q11, d1[1]
	vmla.f32	q5, q11, d3[1]
	vmla.f32	q6, q11, d5[1]
	vmla.f32	q7, q11, d7[1]

	add		r6, r6, r7
	sub		r4, r4, #4
	sub		r6, r6, #64

#else // cortex a15

	// unroll 0
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d5[0]
	vmla.f32	q6, q0, d6[0]
	vmla.f32	q7, q0, d7[0]

	// unroll 1
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q4, q1, d4[1]
	vldr		d4, [r6, #8]  // B[2,3]
	vmla.f32	q5, q1, d5[1]
	vldr		d5, [r6, #24] // B[6,7]
	vmla.f32	q6, q1, d6[1]
	vldr		d6, [r6, #40] // B[10,11]
	vmla.f32	q7, q1, d7[1]
	vldr		d7, [r6, #56] // B[14,15]

	// unroll 2
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d5[0]
	add		r6, r6, r7
	vmla.f32	q6, q0, d6[0]
	vmla.f32	q7, q0, d7[0]

	// unroll 3
//	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q4, q1, d4[1]
//	vldr		d4, [r6, #0]   // B[0,1]
	vmla.f32	q5, q1, d5[1]
//	vldr		d5, [r6, #16]  // B[4,5]
	vmla.f32	q6, q1, d6[1]
//	vldr		d6, [r6, #32]  // B[8,9]
	vmla.f32	q7, q1, d7[1]
//	vldr		d7, [r6, #48]  // B[12,13]

	sub		r4, r4, #4

#endif

	b		2f // return

4: // consider clean1-up loop

	cmp		r4, #0
	ble		2f // return

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

#else // cortex a15
	sub		r5, r5, #16
#endif

3: // clean1-up loop

	// unroll 0
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		s8, [r6, #0]  // B[0]
	vldr		s9, [r6, #16] // B[4]
	vldr		s10, [r6, #32] // B[8]
	vldr		s11, [r6, #48] // B[12]
	vmla.f32	q4, q0, d4[0]
	vmla.f32	q5, q0, d4[1]
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

#else // cortex a15

	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		s8, [r6, #0]  // B[0]
	vmla.f32	q4, q0, d4[0]
	vldr		s8, [r6, #16] // B[4]
	vmla.f32	q5, q0, d4[0]
	vldr		s8, [r6, #32] // B[8]
	vmla.f32	q6, q0, d4[0]
	vldr		s8, [r6, #48] // B[12]
	vmla.f32	q7, q0, d4[0]

#endif

	sub		r4, r4, #1
	add		r6, r6, #4
	cmp		r4, #0
	bgt		3b

2: // return


#if MACRO_LEVEL>=2
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_kernel_gemm_add_nn_4x4_lib4, .-inner_kernel_gemm_add_nn_4x4_lib4
#endif
#endif





// subroutine
//
// input arguments:
// r4   <- k
// r5   <- A
// r6   <- B
// r7   <- bs*sdb*sizeof(float)
// r8   <- offsetB
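//
// Hedged reading of this edge routine: when 0 < offsetB < 4 it processes
// kend single columns of A against the tail rows of the first panel of B,
// then advances B to the start of the next panel so that the main NN kernel
// runs on panel-aligned data:
//
//	// kend = offsetB>0 ? (k < 4-offsetB ? k : 4-offsetB) : 0;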

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_GEMM_ADD_NN_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_gemm_add_nn_4x4_lib4, %function
inner_edge_gemm_add_nn_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_gemm_add_nn_4x4_lib4:
#endif
#endif

	cmp		r8, #0
	ble		2f // return

	cmp		r4, #0
	ble		2f // return


	rsb		r9, r8, #4 // 4-offsetB
	cmp		r9, r4
//	ble		0f
//	mov		r9, r4 // kend=min(k,4-offsetB)
//0:
	movgt	r9, r4 // kend=min(k,4-offsetB)

//	lsl		r10, r8, #2 // offsetB*sizeof(float)
	add		r6, r6, r8, LSL #2 // B + offsetB*sizeof(float)

1:
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		s8, [r6, #0]  // B[0]
	vldr		s9, [r6, #16] // B[4]
	vldr		s10, [r6, #32] // B[8]
	vldr		s11, [r6, #48] // B[12]
	vmla.f32	q4, q0, d4[0]
	vmla.f32	q5, q0, d4[1]
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

#else

	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		s8, [r6, #0]  // B[0]
	vmla.f32	q4, q0, d4[0]
	vldr		s8, [r6, #16] // B[4]
	vmla.f32	q5, q0, d4[0]
	vldr		s8, [r6, #32] // B[8]
	vmla.f32	q6, q0, d4[0]
	vldr		s8, [r6, #48] // B[12]
	vmla.f32	q7, q0, d4[0]

#endif

	sub		r9, r9, #1
	sub		r4, r4, #1
	add		r6, r6, #4

	cmp		r9, #0
	bgt		1b

	cmp		r4, #0
	ble		2f // return

	add		r6, r6, r7
	sub		r6, r6, #16

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_edge_gemm_add_nn_4x4_lib4, .-inner_edge_gemm_add_nn_4x4_lib4
#endif
#endif





// subroutine
//
// cholesky factorization
//
// input arguments:
// r4   <- inv_diag_D
//
// output arguments:
// r4   <- inv_diag_D
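//
// A hedged scalar C reference of the factorization performed on the 4x4
// accumulator (column-major, lower factor overwrites it); inv_diag_D gets
// the reciprocal of each diagonal entry, 0.0 for a non-positive pivot
// (function name illustrative, not BLASFEO API):
//
//	#include <math.h>
//	static void ref_potrf_4x4(float D[16], float inv_diag_D[4])
//		{
//		int i, j, l;
//		for(j=0; j<4; j++)
//			{
//			for(l=0; l<j; l++)
//				for(i=j; i<4; i++)
//					D[i+4*j] -= D[i+4*l] * D[j+4*l];
//			float inv = D[j+4*j]>0.0f ? 1.0f/sqrtf(D[j+4*j]) : 0.0f;
//			inv_diag_D[j] = inv;
//			for(i=j; i<4; i++)
//				D[i+4*j] *= inv;
//			}
//		}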

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_POTRF_4X4_LIB4 lc_zero
#else
	.align 3
99: // 0
	.word 0
	.word 0

	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_potrf_4x4_lib4, %function
inner_edge_potrf_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_potrf_4x4_lib4:
#endif
#endif

	fconsts		s1, #112 // 1.0
#if MACRO_LEVEL>=1
	flds		s0, \lc_zero // 0.0
#else
	flds		s0, 99b // 0.0
#endif

#if 0 // scalar

	// first column
	fcmpes		s16, s0
	fmstat
	ble			1f
	fsqrts		s16, s16
	fdivs		s2, s1, s16
	fsts		s2, [r4, #0]
2:
	fmuls		s17, s17, s2
	fmuls		s18, s18, s2
	fmuls		s19, s19, s2

	// second column
	fnmacs		s21, s17, s17
	fnmacs		s22, s17, s18
	fnmacs		s23, s17, s19
	fcmpes		s21, s0
	fmstat
	ble			3f
	fsqrts		s21, s21
	fdivs		s2, s1, s21
	fsts		s2, [r4, #4]
4:
	fmuls		s22, s22, s2
	fmuls		s23, s23, s2

	// third column
	fnmacs		s26, s18, s18
	fnmacs		s27, s18, s19
	fnmacs		s26, s22, s22
	fnmacs		s27, s22, s23
	fcmpes		s26, s0
	fmstat
	ble			5f
	fsqrts		s26, s26
	fdivs		s2, s1, s26
	fsts		s2, [r4, #8]
6:
	fmuls		s27, s27, s2

	// fourth column
	fnmacs		s31, s19, s19
	fnmacs		s31, s23, s23
	fnmacs		s31, s27, s27
	fcmpes		s31, s0
	fmstat
	ble			7f
	fsqrts		s31, s31
	fdivs		s2, s1, s31
	fsts		s2, [r4, #12]

#else // vector

	// first column
	fcmpes		s16, s0
	fmstat
	ble			1f
	fsqrts		s2, s16
	fdivs		s2, s1, s2
	fsts		s2, [r4, #0]
2:
	vmul.f32	q4, q4, d1[0]

	// second column
	vmls.f32	q5, q4, d8[1]
	fcmpes		s21, s0
	fmstat
	ble			3f
	fsqrts		s2, s21
	fdivs		s2, s1, s2
	fsts		s2, [r4, #4]
4:
	vmul.f32	q5, q5, d1[0]

	// third column
	vmls.f32	q6, q4, d9[0]
	vmls.f32	q6, q5, d11[0]
	fcmpes		s26, s0
	fmstat
	ble			5f
	fsqrts		s2, s26
	fdivs		s2, s1, s2
	fsts		s2, [r4, #8]
6:
	vmul.f32	q6, q6, d1[0]

	// fourth column
	vmls.f32	q7, q4, d9[1]
	vmls.f32	q7, q5, d11[1]
	vmls.f32	q7, q6, d13[1]
	fcmpes		s31, s0
	fmstat
	ble			7f
	fsqrts		s31, s31
	fdivs		s2, s1, s31
	fsts		s2, [r4, #12]

#endif

	b			0f

1:
#if MACRO_LEVEL>=1
	flds		s16, \lc_zero // 0.0
#else
	flds		s16, 99b // 0.0
#endif
	b			2b

3:
#if MACRO_LEVEL>=1
	flds		s21, \lc_zero // 0.0
#else
	flds		s21, 99b // 0.0
#endif
	b			4b

5:
#if MACRO_LEVEL>=1
	flds		s26, \lc_zero // 0.0
#else
	flds		s26, 99b // 0.0
#endif
	b			6b

7:
#if MACRO_LEVEL>=1
	flds		s31, \lc_zero // 0.0
#else
	flds		s31, 99b // 0.0
#endif

0:

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_edge_potrf_4x4_lib4, .-inner_edge_potrf_4x4_lib4
#endif
#endif


// subroutine
//
// triangular substitution:
// side = right
// uplo = lower
// tran = transposed
// requires explicit inverse of diagonal
//
// input arguments:
// r4   <- E
// r5   <- inv_diag_E
//
// output arguments:
// r4   <- E
// r5   <- inv_diag_E
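//
// A hedged C reference (illustrative name): the accumulator X (q4-q7, one
// 4x4 block, column-major) is solved against E^T on the right, i.e.
// X <- X * E^-T with E lower triangular and its reciprocal diagonal given:
//
//	static void ref_trsm_rlt_inv_4x4(float X[16], const float *E, const float *inv_diag_E)
//		{
//		int i, j, l;
//		for(j=0; j<4; j++)
//			{
//			for(l=0; l<j; l++)
//				for(i=0; i<4; i++)
//					X[i+4*j] -= X[i+4*l] * E[j+4*l];
//			for(i=0; i<4; i++)
//				X[i+4*j] *= inv_diag_E[j];
//			}
//		}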

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_TRSM_RLT_INV_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_trsm_rlt_inv_4x4_lib4, %function
inner_edge_trsm_rlt_inv_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_trsm_rlt_inv_4x4_lib4:
#endif
#endif

	// first column
	vldr.32		d0, [r5, #0] // E_inv[0]
	vmul.f32	q4, q4, d0[0];

	// second column
	vldr.32		d0, [r4, #4] // E[1+4*0]
	vmls.f32	q5, q4, d0[0];
	vldr.32		d0, [r5, #4] // E_inv[1]
	vmul.f32	q5, q5, d0[0];

	// third column
	vldr.32		d0, [r4, #8] // E[2+4*0]
	vmls.f32	q6, q4, d0[0];
	vldr.32		d0, [r4, #24] // E[2+4*1]
	vmls.f32	q6, q5, d0[0];
	vldr.32		d0, [r5, #8] // E_inv[2]
	vmul.f32	q6, q6, d0[0];

	// fourth column
	vldr.32		d0, [r4, #12] // E[3+4*0]
	vmls.f32	q7, q4, d0[0];
	vldr.32		d0, [r4, #28] // E[3+4*1]
	vmls.f32	q7, q5, d0[0];
	vldr.32		d0, [r4, #44] // E[3+4*2]
	vmls.f32	q7, q6, d0[0];
	vldr.32		d0, [r5, #12] // E_inv[3]
	vmul.f32	q7, q7, d0[0];

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_edge_trsm_rlt_inv_4x4_lib4, .-inner_edge_trsm_rlt_inv_4x4_lib4
#endif
#endif





// subroutine
//
// input arguments:
// r4   <- alpha
// r5   <- beta
// r6   <- C
//
// output arguments:
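//
// Hedged reading: the 4x4 accumulator acc (q4-q7) becomes
// acc <- alpha*acc + beta*C (C read column-major); the load of C is skipped
// when beta compares equal to 0.0.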

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_4X4_LIB4 lc_zero
#else
	.align 3
99: // 0
	.word 0
	.word 0
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_4x4_lib4, %function
inner_scale_ab_4x4_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_4x4_lib4:
#endif
#endif

	flds		s8, [r4, #0] // alpha
	flds		s9, [r5, #0] // beta
#if MACRO_LEVEL>=2
	flds		s10, \lc_zero // 0.0
#else
	flds		s10, 99b // 0.0
#endif

	fcmpes		s9, s10
	vmul.f32	q4, q4, d4[0]
	vmul.f32	q5, q5, d4[0]
	vmul.f32	q6, q6, d4[0]
	vmul.f32	q7, q7, d4[0]
	fmstat

	beq		0f // end

	vld1.64		{d0, d1, d2, d3}, [r6:128]!
	vmla.f32	q4, q0, d4[1]
	vmla.f32	q5, q1, d4[1]
	vld1.64		{d0, d1, d2, d3}, [r6:128]!
	vmla.f32	q6, q0, d4[1]
	vmla.f32	q7, q1, d4[1]

0:

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
#endif
#endif





// subroutine
//
// input arguments:
// r4   <- beta
// r5   <- C
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_M1B_4X4_LIB4 lc_zero
#else
	.align 3
99: // 0
	.word 0
	.word 0
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_m1b_4x4_lib4, %function
inner_scale_m1b_4x4_lib4:
#elif defined(OS_MAC)
_inner_scale_m1b_4x4_lib4:
#endif
#endif

	flds		s8, [r4, #0] // beta
#if MACRO_LEVEL>=2
	flds		s9, \lc_zero // 0.0
#else
	flds		s9, 99b // 0.0
#endif

	fcmpes		s8, s9
	vneg.f32	q4, q4
	vneg.f32	q5, q5
	vneg.f32	q6, q6
	vneg.f32	q7, q7
	fmstat

	beq			0f // end

	vld1.64		{d0, d1, d2, d3}, [r5:128]!
	vmla.f32	q4, q0, d4[0]
	vmla.f32	q5, q1, d4[0]
	vld1.64		{d0, d1, d2, d3}, [r5:128]!
	vmla.f32	q6, q0, d4[0]
	vmla.f32	q7, q1, d4[0]

0:

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_scale_m1b_4x4_lib4, .-inner_scale_m1b_4x4_lib4
#endif
#endif





// subroutine
//
// input arguments:
// r4   <- C
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_M11_4X4_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_m11_4x4_lib4, %function
inner_scale_m11_4x4_lib4:
#elif defined(OS_MAC)
_inner_scale_m11_4x4_lib4:
#endif
#endif

	vld1.64		{d0, d1, d2, d3}, [r4:128]!
	vsub.f32	q4, q0, q4
	vsub.f32	q5, q1, q5
	vld1.64		{d0, d1, d2, d3}, [r4:128]!
	vsub.f32	q6, q0, q6
	vsub.f32	q7, q1, q7

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_scale_m11_4x4_lib4, .-inner_scale_m11_4x4_lib4
#endif
#endif





// subroutine
//
// input arguments:
// r4   <- D
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X4_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x4_lib4, %function
inner_store_4x4_lib4:
#elif defined(OS_MAC)
_inner_store_4x4_lib4:
#endif
#endif

	vst1.64		{d8, d9, d10, d11}, [r4:128]!
	vst1.64		{d12, d13, d14, d15}, [r4:128]!

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
#endif
#endif





// subroutine
//
// input arguments:
// r4   <- D
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X4_L_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x4_l_lib4, %function
inner_store_4x4_l_lib4:
#elif defined(OS_MAC)
_inner_store_4x4_l_lib4:
#endif
#endif

	// first column
	vstr.64		d8, [r4, #0]
	vstr.64		d9, [r4, #8]
	// second column
	vstr.32		s21, [r4, #20]
	vstr.64		d11, [r4, #24]
	// third column
	vstr.64		d13, [r4, #40]
	// fourth column
	vstr.32		s31, [r4, #60]

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_store_4x4_l_lib4, .-inner_store_4x4_l_lib4
#endif
#endif





	.align 3
99: // 0
	.word 0
	.word 0





//                               r0        r1            r2        r3        sp+0         sp+4      sp+8
// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
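//
// A hedged caller-side sketch (kmax, pA, pB, pC, pD are illustrative names;
// pA and pB are 4-row panels of kmax columns, pC and pD are 4x4 blocks in
// column-major order):
//
//	// float alpha = 1.0f, beta = 0.0f;
//	// kernel_sgemm_nt_4x4_lib4(kmax, &alpha, pA, pB, &beta, pC, pD);
//	// // pD = alpha * pA * pB^T + beta * pC  (one 4x4 block)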

//	.p2align 4,,15
#if defined(OS_LINUX)
	.global	kernel_sgemm_nt_4x4_lib4
	.type	kernel_sgemm_nt_4x4_lib4, %function
kernel_sgemm_nt_4x4_lib4:
#elif defined(OS_MAC)
	.global	_kernel_sgemm_nt_4x4_lib4
_kernel_sgemm_nt_4x4_lib4:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr	d8, 99b
	vldr	d9, 99b
	vmov	q5, q4
	vmov	q6, q4
	vmov	q7, q4



	// call inner kernel sgemm nt
	mov		r4, r0 // kmax
	mov		r5, r2 // A
	mov		r6, r3 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nt_4x4_lib4
#endif
#endif



	// call inner blend for generic alpha and beta
	mov		r4, r1 // alpha
	ldr		r5, [fp, #0] // beta
	ldr		r6, [fp, #4] // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB4 99f
#else
#if defined(OS_LINUX)
	bl inner_scale_ab_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_scale_ab_4x4_lib4
#endif
#endif



	// store n
	ldr		r4, [fp, #8] // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl inner_store_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_store_4x4_lib4
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_sgemm_nt_4x4_lib4, .-kernel_sgemm_nt_4x4_lib4
#endif





	.align 3
99: // 0
	.word 0
	.word 0





//                               r0        r1            r2        r3           sp+0      sp+4     sp+8         sp+12     sp+16
// void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D)
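//
// Hedged usage note: B is passed panel-major with leading panel stride sdb
// and may start offsetB rows into its first panel, e.g. (illustrative names)
//
//	// kernel_sgemm_nn_4x4_lib4(kmax, &alpha, pA, offB, pB, sdb, &beta, pC, pD);
//	// // pD = alpha * pA * pB + beta * pC  (one 4x4 block)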

//	.p2align 4,,15
#if defined(OS_LINUX)
	.global	kernel_sgemm_nn_4x4_lib4
	.type	kernel_sgemm_nn_4x4_lib4, %function
kernel_sgemm_nn_4x4_lib4:
#elif defined(OS_MAC)
	.global	_kernel_sgemm_nn_4x4_lib4
_kernel_sgemm_nn_4x4_lib4:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr	d8, 99b
	vldr	d9, 99b
	vmov	q5, q4
	vmov	q6, q4
	vmov	q7, q4



	// call inner kernel sgemm nn
	mov		r4, r0 // kmax
	mov		r5, r2 // A
	ldr		r6, [fp, #0] // B
	ldr		r7, [fp, #4] // sdb
	lsl		r7, r7, #4 // 4*sizeof(float)*sdb
	mov		r8, r3 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_GEMM_ADD_NN_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_edge_gemm_add_nn_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_edge_gemm_add_nn_4x4_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nn_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nn_4x4_lib4
#endif
#endif



	// call inner blend for generic alpha and beta
	mov		r4, r1 // alpha
	ldr		r5, [fp, #8] // beta
	ldr		r6, [fp, #12] // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB4 99f
#else
#if defined(OS_LINUX)
	bl inner_scale_ab_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_scale_ab_4x4_lib4
#endif
#endif



	// store n
	ldr		r4, [fp, #16] // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl inner_store_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_store_4x4_lib4
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_sgemm_nn_4x4_lib4, .-kernel_sgemm_nn_4x4_lib4
#endif





	.align 3
99: // { 0 }
	.word 0
	.word 0





//                                      r0        r1        r2        r3           sp+0      sp+4      sp+8      sp+12
// void kernel_strsm_nt_rl_inv_4x4_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E);
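//
// Hedged reading of this kernel: it forms beta*C - A*B^T and then applies the
// right/lower/transposed triangular solve against E, i.e.
// D = (beta*C - A*B^T) * E^-T, using inv_diag_E for the diagonal divisions.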

//	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_strsm_nt_rl_inv_4x4_lib4
	.type kernel_strsm_nt_rl_inv_4x4_lib4, %function
kernel_strsm_nt_rl_inv_4x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_strsm_nt_rl_inv_4x4_lib4
_kernel_strsm_nt_rl_inv_4x4_lib4:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr	d8, 99b
	vldr	d9, 99b
	vmov	q5, q4
	vmov	q6, q4
	vmov	q7, q4



	// call inner kernel sgemm nt
	mov		r4, r0 // kmax
	mov		r5, r1 // A
	mov		r6, r2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nt_4x4_lib4
#endif
#endif



	// call inner blend for alpha=-1.0 and generic beta
	mov		r4, r3 // beta
	ldr		r5, [fp, #0] // C

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_LIB4 99f
#else
#if defined(OS_LINUX)
	bl inner_scale_m1b_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_scale_m1b_4x4_lib4
#endif
#endif



	// solve
	ldr		r4, [fp, #8] // E
	ldr		r5, [fp, #12] // inv_diag_E

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_RLT_INV_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl inner_edge_trsm_rlt_inv_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_edge_trsm_rlt_inv_4x4_lib4
#endif
#endif



	// store n
	ldr		r4, [fp, #4] // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl inner_store_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_store_4x4_lib4
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_strsm_nt_rl_inv_4x4_lib4, .-kernel_strsm_nt_rl_inv_4x4_lib4
#endif





	.align 3
99: // 0
	.word 0
	.word 0





//                                  r0        r1        r2        r3        sp+0      sp+4
// void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D);
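//
// Hedged reading of this kernel: it forms C - A*B^T, factorizes the result in
// place as L*L^T (lower Cholesky), stores the lower factor in D and the
// reciprocals of its diagonal in inv_diag_D.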

//	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_spotrf_nt_l_4x4_lib4
	.type kernel_spotrf_nt_l_4x4_lib4, %function
kernel_spotrf_nt_l_4x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_spotrf_nt_l_4x4_lib4
_kernel_spotrf_nt_l_4x4_lib4:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr	d8, 99b
	vldr	d9, 99b
	vmov	q5, q4
	vmov	q6, q4
	vmov	q7, q4



	// call inner kernel sgemm nt
	mov		r4, r0 // kmax
	mov		r5, r1 // A
	mov		r6, r2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nt_4x4_lib4
#endif
#endif



	// call inner blend for alpha=-1.0 and beta=1.0
	mov		r4, r3 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl inner_scale_m11_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_scale_m11_4x4_lib4
#endif
#endif



	// factorization
	ldr		r4, [fp, #4] // inv_diag_D

#if MACRO_LEVEL>=1
	INNER_EDGE_POTRF_4X4_LIB4 99f
#else
#if defined(OS_LINUX)
	bl inner_edge_potrf_4x4_lib4
#elif defined(OS_MAC)
	bl _inner_edge_potrf_4x4_lib4
#endif
#endif



	// store l
	ldr		r4, [fp, #0] // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_L_LIB4
#else
#if defined(OS_LINUX)
	bl inner_store_4x4_l_lib4
#elif defined(OS_MAC)
	bl _inner_store_4x4_l_lib4
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_spotrf_nt_l_4x4_lib4, .-kernel_spotrf_nt_l_4x4_lib4
#endif





	.align 3
99: // { 0 }
	.word 0
	.word 0





#if defined(BLAS_API)

#include "kernel_sgemm_4x4_lib.S"

#endif