@ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
@
@ Element-wise single-precision add:
@     cVector[i] = aVector[i] + bVector[i]    for 0 <= i < num_points
@
@ Target: 32-bit ARM with NEON, GNU as syntax ('@' starts a comment).
@ In:     r0 = cVector     pointer to output array
@         r1 = aVector     pointer to input array 1
@         r2 = bVector     pointer to input array 2
@         r3 = num_points  number of items to process
@ Out:    none (results are stored through cVector)
@ Clobb:  r0-r3, r12, q0-q2 (d0-d5), flags - all caller-saved under AAPCS
@ Stack:  not used
@ Align:  the 4-float path uses :128 address qualifiers, so whenever
@         num_points >= 4 all three pointers must be 16-byte aligned.
@         The scalar tail loads/stores carry no alignment qualifier.
	.global	volk_32f_x2_add_32f_a_neonpipeline
volk_32f_x2_add_32f_a_neonpipeline:
	cVector		.req	r0
	aVector		.req	r1
	bVector		.req	r2
	num_points	.req	r3	@ reduced to num_points % 4 before the scalar tail
	blocksLeft	.req	r12	@ 4-float blocks that still have to be loaded
	aVal		.req	q0	@ d0-d1
	bVal		.req	q1	@ d2-d3
	cVal		.req	q2	@ d4-d5

	pld	[aVector, #128]			@ pre-load hint - this is implementation specific!
	pld	[bVector, #128]			@ pre-load hint - this is implementation specific!

	movs	blocksLeft, num_points, lsr #2	@ blocksLeft = num_points / 4
	beq	.Ladd_neonpipeline_tail		@ fewer than 4 points: scalar tail only

	@ Software pipeline: the loads for block k+1 are issued between the
	@ add and the store of block k, so the first block is loaded up front.
	vld1.32	{d0-d1}, [aVector:128]!		@ aVal
	vld1.32	{d2-d3}, [bVector:128]!		@ bVal
	subs	blocksLeft, blocksLeft, #1
	beq	.Ladd_neonpipeline_flush	@ exactly one block: nothing left to load

.Ladd_neonpipeline_loop:
	@ Invariant: aVal/bVal hold a loaded but not yet added block, blocksLeft > 0.
	pld	[aVector, #128]			@ pre-load hint - this is implementation specific!
	pld	[bVector, #128]			@ pre-load hint - this is implementation specific!
	vadd.f32	cVal, bVal, aVal
	vld1.32	{d0-d1}, [aVector:128]!		@ aVal (next block)
	vld1.32	{d2-d3}, [bVector:128]!		@ bVal (next block)
	vst1.32	{d4-d5}, [cVector:128]!		@ cVal (current block)
	subs	blocksLeft, blocksLeft, #1
	bne	.Ladd_neonpipeline_loop

.Ladd_neonpipeline_flush:
	@ Drain the pipeline: the last loaded block has not been added yet.
	vadd.f32	cVal, bVal, aVal
	vst1.32	{d4-d5}, [cVector:128]!		@ cVal

.Ladd_neonpipeline_tail:
	@ Handle the num_points % 4 leftover items one at a time. The count is
	@ derived from num_points itself, so this path is also correct when it
	@ is entered directly because num_points < 4 (no state from the vector
	@ path is needed).
	ands	num_points, num_points, #3
	beq	.Ladd_neonpipeline_done

.Ladd_neonpipeline_tail_loop:
	vld1.32	{d0[0]}, [aVector]!		@ s0 = *aVector++
	vld1.32	{d0[1]}, [bVector]!		@ s1 = *bVector++
	vadd.f32	s2, s1, s0		@ s2 is lane 0 of d1
	vst1.32	{d1[0]}, [cVector]!		@ *cVector++ = s2
	subs	num_points, num_points, #1
	bne	.Ladd_neonpipeline_tail_loop

.Ladd_neonpipeline_done:
	bx	lr