/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/*
 * Sum of absolute values over a strided vector of two-float elements
 * (AArch64 Advanced SIMD, GNU as syntax, run through the C preprocessor).
 *
 * Every element is 8 bytes: two 32-bit floats.  For each of the N elements
 * the code adds |lane0| + |lane1| into a single-precision running total.
 * The local labels are prefixed "casum"; the two floats are presumably the
 * real and imaginary parts of a single-precision complex number
 * -- TODO confirm against the caller.
 *
 * Inputs (see the register aliases below):
 *   N     (x0)  number of elements; N <= 0 leaves the total at 0.0
 *   X     (x1)  address of the first element
 *   INC_X (x2)  stride counted in elements; INC_X <= 0 leaves the total at 0.0
 * Output:
 *   SUMF  (s0)  holds the total at every ret (presumably the AAPCS64
 *               float return register -- confirm against the C prototype)
 * Written by the code in this file:
 *   x1 (advanced), x2 (scaled on the strided path), x5, v0-v4, NZCV flags.
 *
 * The entry symbol and anything else emitted around the body come from the
 * PROLOGUE / EPILOGUE macros, which are not defined in this file
 * (presumably they come from common.h).
 */

#define ASSEMBLER
#include "common.h"

#define	N	x0	/* vector length (number of 8-byte elements) */
#define	X	x1	/* X vector address; advanced as elements are consumed */
#define	INC_X	x2	/* X stride in elements; turned into a byte step by INIT_S */
#define I	x5	/* loop variable (counts down to zero) */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define REG0	wzr		/* zero register, used to clear SUMF */
#define SUMF	s0		/* running total; s0 is lane 0 of v0 */
#define TMPF	s1		/* per-element scratch; s1 is lane 0 of v1 */
/* NOTE(review): TMPVF and SZ are not referenced by any code visible here. */
#define TMPVF	{v1.s}[0]
#define SZ	4

/******************************************************************************/

/*
 * KERNEL_F1: consume one element on the contiguous path.
 * Adds |lane0| + |lane1| of the element at X to SUMF and advances X by 8 bytes.
 * Writes v1, v2, s0.
 */
.macro KERNEL_F1
	ld1	{v1.2s}, [X], #8		/* load two floats, X += 8 */
	fabs	v1.2s, v1.2s			/* absolute value of both lanes */
	ext	v2.8b, v1.8b, v1.8b, #4		/* rotate by 4 bytes: s2 = old lane 1 */
	fadd	TMPF, TMPF, s2			/* s1 = |lane0| + |lane1| */
	fadd	SUMF, SUMF, TMPF		/* fold into the running total */
.endm

/*
 * KERNEL_F8: consume eight elements (64 bytes, sixteen floats) on the
 * contiguous path.  The absolute values are accumulated lane-wise into the
 * four lanes of v0; KERNEL_F8_FINALIZE must run afterwards to collapse those
 * lanes into SUMF.  Writes v0-v4 and advances X by 64 bytes.
 * The order of the fadd instructions fixes how the result is rounded.
 */
.macro KERNEL_F8
	ld1	{v1.4s, v2.4s, v3.4s, v4.4s}, [X]	/* 4 x 4 floats */
	add	X, X, #64
	fabs	v1.4s, v1.4s
	fabs	v2.4s, v2.4s
	fabs	v3.4s, v3.4s
	fabs	v4.4s, v4.4s

	/* Prefetch for load into L1, 1024 bytes past the already advanced X. */
	PRFM	PLDL1KEEP, [X, #1024]

	fadd	v1.4s, v1.4s, v2.4s		/* v1 = |v1| + |v2| */
	fadd	v3.4s, v3.4s, v4.4s		/* v3 = |v3| + |v4| */
	fadd	v0.4s, v0.4s, v1.4s		/* accumulate first pair */
	fadd	v0.4s, v0.4s, v3.4s		/* accumulate second pair */
.endm

/*
 * KERNEL_F8_FINALIZE: horizontal reduction of the four partial sums held in
 * v0 into the scalar SUMF.  Writes v0 and v1.
 */
.macro KERNEL_F8_FINALIZE
	ext	v1.16b, v0.16b, v0.16b, #8	/* low half of v1 = high half of v0 */
	fadd	v0.2s, v0.2s, v1.2s		/* lanes {0+2, 1+3} */
	faddp	SUMF, v0.2s			/* SUMF = lane0 + lane1 */
.endm

/*
 * INIT_S: prepare the strided path by multiplying INC_X by 8 (the size of one
 * element in bytes) so it can be used directly as a post-index byte step.
 */
.macro INIT_S
	lsl	INC_X, INC_X, #3
.endm

/*
 * KERNEL_S1: consume one element on the strided path.
 * Same arithmetic as KERNEL_F1, but X is advanced by the byte step in INC_X
 * (INIT_S must have run first).  Writes v1, v2, s0.
 */
.macro KERNEL_S1
	ld1	{v1.2s}, [X], INC_X		/* load two floats, X += INC_X */
	fabs	v1.2s, v1.2s			/* absolute value of both lanes */
	ext	v2.8b, v1.8b, v1.8b, #4		/* rotate by 4 bytes: s2 = old lane 1 */
	fadd	TMPF, TMPF, s2			/* s1 = |lane0| + |lane1| */
	fadd	SUMF, SUMF, TMPF		/* fold into the running total */

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

	PROLOGUE

	/*
	 * Clear the total.  A scalar write to s0 also clears the remaining
	 * lanes of v0 (A64 scalar-write behaviour), which KERNEL_F8 relies on
	 * when it uses v0 as a four-lane accumulator.
	 */
	fmov	SUMF, REG0
	/*
	 * NOTE(review): every kernel macro reloads v1 with ld1 before s1 is
	 * read, so this initialisation of s1 looks redundant -- confirm before
	 * removing.
	 */
	fmov	s1, SUMF

	/* Nothing to do for a non-positive count or stride (signed compares). */
	cmp	N, xzr
	ble	.Lcasum_kernel_L999
	cmp	INC_X, xzr
	ble	.Lcasum_kernel_L999

	/* Unit stride takes the contiguous path, anything else the strided one. */
	cmp	INC_X, #1
	bne	.Lcasum_kernel_S_BEGIN

.Lcasum_kernel_F_BEGIN:

	asr	I, N, #3			/* I = number of 8-element groups */
	cmp	I, xzr
	beq	.Lcasum_kernel_F1		/* fewer than 8 elements: tail only */

.Lcasum_kernel_F8:

	KERNEL_F8

	subs	I, I, #1
	bne	.Lcasum_kernel_F8

	KERNEL_F8_FINALIZE			/* v0 lanes -> SUMF */

.Lcasum_kernel_F1:

	ands	I, N, #7			/* I = leftover elements (0..7) */
	ble	.Lcasum_kernel_L999		/* none left: done */

.Lcasum_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	.Lcasum_kernel_F10

.Lcasum_kernel_L999:
	ret					/* total is in SUMF (s0) */

.Lcasum_kernel_S_BEGIN:

	INIT_S					/* INC_X is a byte step from here on */

	asr	I, N, #2			/* I = number of 4-element groups */
	cmp	I, xzr
	ble	.Lcasum_kernel_S1		/* fewer than 4 elements: tail only */

.Lcasum_kernel_S4:

	/* Four elements per iteration, each accumulated directly into SUMF. */
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1

	subs	I, I, #1
	bne	.Lcasum_kernel_S4

.Lcasum_kernel_S1:

	ands	I, N, #3			/* I = leftover elements (0..3) */
	ble	.Lcasum_kernel_L999		/* none left: done */

.Lcasum_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	bne	.Lcasum_kernel_S10

	ret					/* total is in SUMF (s0) */

	EPILOGUE