1 /***************************************************************************
2 Copyright (c) 2013-2018, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
27 
28 
29 #include "common.h"
30 
/*
 * Byte offsets passed to vec_vsx_ld()/vec_vsx_st() by the vector kernel.
 * offset_k addresses the k-th consecutive 16-byte vector (four floats)
 * counted from the base pointer.
 */
#define offset_0   (0 * 16)
#define offset_1   (1 * 16)
#define offset_2   (2 * 16)
#define offset_3   (3 * 16)
#define offset_4   (4 * 16)
#define offset_5   (5 * 16)
#define offset_6   (6 * 16)
#define offset_7   (7 * 16)
#define offset_8   (8 * 16)
#define offset_9   (9 * 16)
#define offset_10  (10 * 16)
#define offset_11  (11 * 16)
#define offset_12  (12 * 16)
#define offset_13  (13 * 16)
#define offset_14  (14 * 16)
#define offset_15  (15 * 16)
47 
48 
49 #if defined(__VEC__) || defined(__ALTIVEC__)
50 
51 #ifndef HAVE_KERNEL_8
52 #include <altivec.h>
53 
/*
 * Vector core of saxpy: y[k] += alpha * x[k] for k in [0, n).
 *
 * Every pass of the loop handles 16 four-float vectors (64 floats) of both
 * x and y, whole blocks only, so n is expected to be a multiple of 64; the
 * caller in this file passes n & -64.  Within a pass, all 32 loads are
 * issued before the first store.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
    /* alpha replicated into all four lanes */
    const __vector float scale = {alpha, alpha, alpha, alpha};
    /* number of four-float vectors covered by n */
    const BLASLONG vec_total = n / 4;
    __vector float *src = (__vector float *)x;
    __vector float *dst = (__vector float *)y;

    for (BLASLONG done = 0; done < vec_total; done += 16, src += 16, dst += 16) {
        /* 16 vectors of x */
        __vector float in0  = vec_vsx_ld(offset_0,  src);
        __vector float in1  = vec_vsx_ld(offset_1,  src);
        __vector float in2  = vec_vsx_ld(offset_2,  src);
        __vector float in3  = vec_vsx_ld(offset_3,  src);
        __vector float in4  = vec_vsx_ld(offset_4,  src);
        __vector float in5  = vec_vsx_ld(offset_5,  src);
        __vector float in6  = vec_vsx_ld(offset_6,  src);
        __vector float in7  = vec_vsx_ld(offset_7,  src);
        __vector float in8  = vec_vsx_ld(offset_8,  src);
        __vector float in9  = vec_vsx_ld(offset_9,  src);
        __vector float in10 = vec_vsx_ld(offset_10, src);
        __vector float in11 = vec_vsx_ld(offset_11, src);
        __vector float in12 = vec_vsx_ld(offset_12, src);
        __vector float in13 = vec_vsx_ld(offset_13, src);
        __vector float in14 = vec_vsx_ld(offset_14, src);
        __vector float in15 = vec_vsx_ld(offset_15, src);

        /* matching 16 vectors of y */
        __vector float acc0  = vec_vsx_ld(offset_0,  dst);
        __vector float acc1  = vec_vsx_ld(offset_1,  dst);
        __vector float acc2  = vec_vsx_ld(offset_2,  dst);
        __vector float acc3  = vec_vsx_ld(offset_3,  dst);
        __vector float acc4  = vec_vsx_ld(offset_4,  dst);
        __vector float acc5  = vec_vsx_ld(offset_5,  dst);
        __vector float acc6  = vec_vsx_ld(offset_6,  dst);
        __vector float acc7  = vec_vsx_ld(offset_7,  dst);
        __vector float acc8  = vec_vsx_ld(offset_8,  dst);
        __vector float acc9  = vec_vsx_ld(offset_9,  dst);
        __vector float acc10 = vec_vsx_ld(offset_10, dst);
        __vector float acc11 = vec_vsx_ld(offset_11, dst);
        __vector float acc12 = vec_vsx_ld(offset_12, dst);
        __vector float acc13 = vec_vsx_ld(offset_13, dst);
        __vector float acc14 = vec_vsx_ld(offset_14, dst);
        __vector float acc15 = vec_vsx_ld(offset_15, dst);

        /* y += x * alpha, lane by lane */
        acc0  += in0  * scale;
        acc1  += in1  * scale;
        acc2  += in2  * scale;
        acc3  += in3  * scale;
        acc4  += in4  * scale;
        acc5  += in5  * scale;
        acc6  += in6  * scale;
        acc7  += in7  * scale;
        acc8  += in8  * scale;
        acc9  += in9  * scale;
        acc10 += in10 * scale;
        acc11 += in11 * scale;
        acc12 += in12 * scale;
        acc13 += in13 * scale;
        acc14 += in14 * scale;
        acc15 += in15 * scale;

        /* write the updated block back to y */
        vec_vsx_st(acc0,  offset_0,  dst);
        vec_vsx_st(acc1,  offset_1,  dst);
        vec_vsx_st(acc2,  offset_2,  dst);
        vec_vsx_st(acc3,  offset_3,  dst);
        vec_vsx_st(acc4,  offset_4,  dst);
        vec_vsx_st(acc5,  offset_5,  dst);
        vec_vsx_st(acc6,  offset_6,  dst);
        vec_vsx_st(acc7,  offset_7,  dst);
        vec_vsx_st(acc8,  offset_8,  dst);
        vec_vsx_st(acc9,  offset_9,  dst);
        vec_vsx_st(acc10, offset_10, dst);
        vec_vsx_st(acc11, offset_11, dst);
        vec_vsx_st(acc12, offset_12, dst);
        vec_vsx_st(acc13, offset_13, dst);
        vec_vsx_st(acc14, offset_14, dst);
        vec_vsx_st(acc15, offset_15, dst);
    }
}
155 #endif
156 #endif
157 
/*
 * y := y + da * x over n elements, reading x with stride inc_x and updating
 * y with stride inc_y.  Does nothing when n <= 0.  Always returns 0.
 * dummy0, dummy1, dummy and dummy2 are not used.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	if (n <= 0)
		return 0;

	if (inc_x == 1 && inc_y == 1) {
		BLASLONG pos = 0;

#if defined(__VEC__) || defined(__ALTIVEC__)
		/* Largest multiple-of-64 prefix goes to the vector kernel. */
		const BLASLONG bulk = n & -64;

		if (bulk != 0)
			saxpy_kernel_64(bulk, x, y, da);

		pos = bulk;
#endif
		/* Scalar loop for everything the kernel did not cover. */
		for (; pos < n; pos++)
			y[pos] += da * x[pos];

		return 0;
	}

	/* Strided case: four elements per pass, then the remainder. */
	const BLASLONG quad_end = n & -4;
	BLASLONG k = 0;
	BLASLONG xi = 0;
	BLASLONG yi = 0;

	for (; k < quad_end; k += 4) {
		/* All four products are formed before y is written. */
		const FLOAT p0 = da * x[xi];
		const FLOAT p1 = da * x[xi + inc_x];
		const FLOAT p2 = da * x[xi + 2 * inc_x];
		const FLOAT p3 = da * x[xi + 3 * inc_x];

		y[yi]             += p0;
		y[yi + inc_y]     += p1;
		y[yi + 2 * inc_y] += p2;
		y[yi + 3 * inc_y] += p3;

		xi += inc_x * 4;
		yi += inc_y * 4;
	}

	for (; k < n; k++) {
		y[yi] += da * x[xi];
		xi += inc_x;
		yi += inc_y;
	}

	return 0;
}
221 
222 
223 
224