/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
27
28
29 #include "common.h"
30
/*
 * Displacements passed as the offset argument of vec_vsx_ld()/vec_vsx_st()
 * in saxpy_kernel_64.  offset_k equals k * 16, i.e. the k-th 16-byte vector
 * slot counted from the base pointer.
 */
#define offset_0 0
#define offset_1 16
#define offset_2 32
#define offset_3 48
#define offset_4 64
#define offset_5 80
#define offset_6 96
#define offset_7 112
#define offset_8 128
#define offset_9 144
#define offset_10 160
#define offset_11 176
#define offset_12 192
#define offset_13 208
#define offset_14 224
#define offset_15 240
47
48
#if defined(__VEC__) || defined(__ALTIVEC__)

/*
 * The kernel below is compiled only when HAVE_KERNEL_8 is not defined.
 * NOTE(review): presumably HAVE_KERNEL_8 signals that another definition of
 * saxpy_kernel_64 is supplied elsewhere -- not visible from this file.
 */
#ifndef HAVE_KERNEL_8
#include <altivec.h>

/*
 * saxpy_kernel_64 - vectorised y[k] += alpha * x[k].
 *
 *   n      element count; expected to be a multiple of 64.  The loop works
 *          on whole groups of 16 vectors (4 lanes each) and has no tail
 *          handling, so any other value touches memory past n elements.
 *   x      source operand, read only.
 *   y      destination operand, updated in place.
 *   alpha  scalar multiplier, broadcast to all four vector lanes.
 *
 * x and y are accessed through vec_vsx_ld()/vec_vsx_st() with the byte
 * displacements offset_0 .. offset_15.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
    const BLASLONG n_vec = n / 4;   /* number of 4-lane vectors in n elements */
    BLASLONG i;
    __vector float v_a __attribute__((aligned(16))) = {alpha, alpha, alpha, alpha};
    __vector float *vptr_y = (__vector float *)y;
    __vector float *vptr_x = (__vector float *)x;

    /* i counts vectors; each pass consumes 16 of them (64 elements). */
    for (i = 0; i < n_vec; i += 16) {

        /*
         * Both operands are fully loaded before the first store; keep this
         * order so the result does not change should x and y overlap.
         */
        __vector float vy_0  = vec_vsx_ld(offset_0,  vptr_y);
        __vector float vy_1  = vec_vsx_ld(offset_1,  vptr_y);
        __vector float vy_2  = vec_vsx_ld(offset_2,  vptr_y);
        __vector float vy_3  = vec_vsx_ld(offset_3,  vptr_y);
        __vector float vy_4  = vec_vsx_ld(offset_4,  vptr_y);
        __vector float vy_5  = vec_vsx_ld(offset_5,  vptr_y);
        __vector float vy_6  = vec_vsx_ld(offset_6,  vptr_y);
        __vector float vy_7  = vec_vsx_ld(offset_7,  vptr_y);
        __vector float vy_8  = vec_vsx_ld(offset_8,  vptr_y);
        __vector float vy_9  = vec_vsx_ld(offset_9,  vptr_y);
        __vector float vy_10 = vec_vsx_ld(offset_10, vptr_y);
        __vector float vy_11 = vec_vsx_ld(offset_11, vptr_y);
        __vector float vy_12 = vec_vsx_ld(offset_12, vptr_y);
        __vector float vy_13 = vec_vsx_ld(offset_13, vptr_y);
        __vector float vy_14 = vec_vsx_ld(offset_14, vptr_y);
        __vector float vy_15 = vec_vsx_ld(offset_15, vptr_y);

        __vector float vx_0  = vec_vsx_ld(offset_0,  vptr_x);
        __vector float vx_1  = vec_vsx_ld(offset_1,  vptr_x);
        __vector float vx_2  = vec_vsx_ld(offset_2,  vptr_x);
        __vector float vx_3  = vec_vsx_ld(offset_3,  vptr_x);
        __vector float vx_4  = vec_vsx_ld(offset_4,  vptr_x);
        __vector float vx_5  = vec_vsx_ld(offset_5,  vptr_x);
        __vector float vx_6  = vec_vsx_ld(offset_6,  vptr_x);
        __vector float vx_7  = vec_vsx_ld(offset_7,  vptr_x);
        __vector float vx_8  = vec_vsx_ld(offset_8,  vptr_x);
        __vector float vx_9  = vec_vsx_ld(offset_9,  vptr_x);
        __vector float vx_10 = vec_vsx_ld(offset_10, vptr_x);
        __vector float vx_11 = vec_vsx_ld(offset_11, vptr_x);
        __vector float vx_12 = vec_vsx_ld(offset_12, vptr_x);
        __vector float vx_13 = vec_vsx_ld(offset_13, vptr_x);
        __vector float vx_14 = vec_vsx_ld(offset_14, vptr_x);
        __vector float vx_15 = vec_vsx_ld(offset_15, vptr_x);

        /* y += x * alpha, lane by lane */
        vy_0  += vx_0*v_a;
        vy_1  += vx_1*v_a;
        vy_2  += vx_2*v_a;
        vy_3  += vx_3*v_a;
        vy_4  += vx_4*v_a;
        vy_5  += vx_5*v_a;
        vy_6  += vx_6*v_a;
        vy_7  += vx_7*v_a;
        vy_8  += vx_8*v_a;
        vy_9  += vx_9*v_a;
        vy_10 += vx_10*v_a;
        vy_11 += vx_11*v_a;
        vy_12 += vx_12*v_a;
        vy_13 += vx_13*v_a;
        vy_14 += vx_14*v_a;
        vy_15 += vx_15*v_a;

        vec_vsx_st(vy_0,  offset_0,  vptr_y);
        vec_vsx_st(vy_1,  offset_1,  vptr_y);
        vec_vsx_st(vy_2,  offset_2,  vptr_y);
        vec_vsx_st(vy_3,  offset_3,  vptr_y);
        vec_vsx_st(vy_4,  offset_4,  vptr_y);
        vec_vsx_st(vy_5,  offset_5,  vptr_y);
        vec_vsx_st(vy_6,  offset_6,  vptr_y);
        vec_vsx_st(vy_7,  offset_7,  vptr_y);
        vec_vsx_st(vy_8,  offset_8,  vptr_y);
        vec_vsx_st(vy_9,  offset_9,  vptr_y);
        vec_vsx_st(vy_10, offset_10, vptr_y);
        vec_vsx_st(vy_11, offset_11, vptr_y);
        vec_vsx_st(vy_12, offset_12, vptr_y);
        vec_vsx_st(vy_13, offset_13, vptr_y);
        vec_vsx_st(vy_14, offset_14, vptr_y);
        vec_vsx_st(vy_15, offset_15, vptr_y);

        /* step both operands past the 16 vectors just processed */
        vptr_x += 16;
        vptr_y += 16;
    }
}
#endif
#endif
157
/*
 * AXPY kernel entry point: y[k*inc_y] += da * x[k*inc_x] for k in [0, n).
 *
 *   n              number of elements; n <= 0 is a no-op.
 *   da             scalar multiplier.
 *   x, inc_x       source operand and its element stride.
 *   y, inc_y       destination operand (updated in place) and its stride.
 *   dummy0, dummy1, dummy, dummy2
 *                  not read by this function.
 *
 * Always returns 0.
 *
 * With unit strides and AltiVec available, the largest multiple-of-64
 * prefix is handed to saxpy_kernel_64 and the rest is finished by a scalar
 * loop.  Any other stride combination uses a 4-way unrolled scalar loop
 * followed by a scalar tail.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;

    if (n <= 0) {
        return 0;
    }

    if ((inc_x == 1) && (inc_y == 1)) {
#if defined(__VEC__) || defined(__ALTIVEC__)
        /* n rounded down to a multiple of 64: the part the vector kernel takes */
        BLASLONG n64 = n & -64;

        if (n64) {
            saxpy_kernel_64(n64, x, y, da);
        }

        i = n64;
#endif
        /* scalar remainder (the whole range when the kernel is not built) */
        while (i < n) {
            y[i] += da * x[i];
            i++;
        }
        return 0;
    }

    /* Strided case: n rounded down to a multiple of 4 for the unrolled loop. */
    BLASLONG n4 = n & -4;

    while (i < n4) {
        /* all four products are formed before any element of y is written */
        FLOAT m1 = da * x[ix];
        FLOAT m2 = da * x[ix + inc_x];
        FLOAT m3 = da * x[ix + 2*inc_x];
        FLOAT m4 = da * x[ix + 3*inc_x];

        y[iy]           += m1;
        y[iy + inc_y]   += m2;
        y[iy + 2*inc_y] += m3;
        y[iy + 3*inc_y] += m4;

        ix += inc_x*4;
        iy += inc_y*4;
        i  += 4;
    }

    /* up to three leftover elements */
    while (i < n) {
        y[iy] += da * x[ix];
        ix += inc_x;
        iy += inc_y;
        i++;
    }
    return 0;
}
221
222
223
224