1@ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
2	.global	volk_32f_x2_add_32f_a_neonpipeline
volk_32f_x2_add_32f_a_neonpipeline:
	@ Element-wise single-precision add: cVector[i] = aVector[i] + bVector[i]
	@ for i in [0, num_points).
	@
	@ In:
	@   r0 - cVector: pointer to output array
	@   r1 - aVector: pointer to input array 1
	@   r2 - bVector: pointer to input array 2
	@   r3 - num_points: number of items to process
	@ The vector path uses the :128 alignment qualifier on all three pointers,
	@ so they must be 16-byte aligned whenever num_points >= 4.
	@ r0-r2 are advanced past the processed elements; r7/r8 are saved and
	@ restored; q0-q2 and the condition flags are overwritten.
	cVector .req r0
	aVector .req r1
	bVector .req r2
	num_points .req r3
	quarterPoints .req r7
	number .req r8
	aVal .req q0 @ d0-d1
	bVal .req q1 @ d2-d3
	cVal .req q2 @ d4-d5

	stmfd	sp!, {r7, r8}	@ prologue - save the only callee-saved registers used

	pld [aVector, #128] @ pre-load hint - this is implementation specific!
	pld [bVector, #128] @ pre-load hint - this is implementation specific!

	@ number doubles as the scalar-tail index. It must be defined before the
	@ early branch below; otherwise inputs with fewer than 4 points would be
	@ compared against whatever value the caller left in r8.
	mov	number, #0
	movs	quarterPoints, num_points, lsr #2	@ count of full 4-float groups
	beq	.loop2	@ no full group: handle everything in the scalar tail

	mov	number, quarterPoints	@ groups still to be loaded

	@ Software pipeline: load group 0 up front so that each loop iteration
	@ adds the previously loaded group while fetching the next one.
	vld1.32	{d0-d1}, [aVector:128]!	@ aVal
	vld1.32	{d2-d3}, [bVector:128]!	@ bVal
	subs	number, number, #1
	beq	.flushpipe	@ exactly one group: nothing more to load

.loop1:
	pld [aVector, #128] @ pre-load hint - this is implementation specific!
	pld [bVector, #128] @ pre-load hint - this is implementation specific!
	vadd.f32	cVal, bVal, aVal	@ sum of the group loaded last time round
	vld1.32	{d0-d1}, [aVector:128]!	@ aVal - next group
	vld1.32	{d2-d3}, [bVector:128]!	@ bVal - next group
	vst1.32	{d4-d5}, [cVector:128]!	@ cVal

	subs	number, number, #1
	bne	.loop1	@ first loop

.flushpipe:
	@ The last loaded group is still in aVal/bVal: add and store it.
	vadd.f32	cVal, bVal, aVal
	vst1.32	{d4-d5}, [cVector:128]!	@ cVal

	mov	number, quarterPoints, asl #2	@ elements handled by the vector path

.loop2:
	@ Scalar tail: number = index of the next element, runs up to num_points.
	cmp	num_points, number
	bls	.done	@ unsigned: stop once number >= num_points

	vld1.32	{d0[0]}, [aVector]!	@ s0 = *aVector++
	vld1.32	{d0[1]}, [bVector]!	@ s1 = *bVector++
	vadd.f32	s2, s1, s0	@ s2 is lane 0 of d1
	vst1.32	{d1[0]}, [cVector]!	@ *cVector++ = s2
	add	number, number, #1
	b	.loop2

.done:
	ldmfd	sp!, {r7, r8}	@ epilogue - restore register states
	bx	lr
66