1 /* This file is autogenerated. Do not edit. */
2 /*
3 * LIBOIL - Library of Optimized Inner Loops
4 * Copyright (c) 2005 David A. Schleef <ds.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
20 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #ifdef HAVE_CONFIG_H
30 #include "config.h"
31 #endif
32
33 #include <math.h>
34
35 #include <liboil/liboil.h>
36 #include <liboil/liboilclasses.h>
37
38 static void
add_f32_pointer(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)39 add_f32_pointer (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
40 {
41 while (n) {
42 *dest = *src1 + *src2;
43 dest++;
44 src1++;
45 src2++;
46 n--;
47 }
48 }
49 OIL_DEFINE_IMPL (add_f32_pointer, add_f32);
50
51 static void
add_f32_unroll2(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)52 add_f32_unroll2 (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
53 {
54 int i;
55
56 if (n & 1) {
57 dest[0] = src1[0] + src2[0];
58 dest++;
59 src1++;
60 src2++;
61 n--;
62 }
63 for(i=0;i<n;i+=2){
64 dest[i] = src1[i] + src2[i];
65 dest[i+1] = src1[i+1] + src2[i+1];
66 }
67 }
68 OIL_DEFINE_IMPL (add_f32_unroll2, add_f32);
69
70 static void
add_f32_unroll4a(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)71 add_f32_unroll4a (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
72 {
73 int i;
74
75 while (n & 3) {
76 dest[0] = src1[0] + src2[0];
77 dest++;
78 src1++;
79 src2++;
80 n--;
81 }
82 for(i=0;i<n;i+=4){
83 dest[i] = src1[i] + src2[i];
84 dest[i+1] = src1[i+1] + src2[i+1];
85 dest[i+2] = src1[i+2] + src2[i+2];
86 dest[i+3] = src1[i+3] + src2[i+3];
87 }
88 }
89 OIL_DEFINE_IMPL (add_f32_unroll4a, add_f32);
90
91 static void
add_f32_unroll4b(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)92 add_f32_unroll4b (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
93 {
94 int i;
95
96 for(i=0;i<(n&(~0x3));i+=4){
97 dest[i+0] = src1[i+0] + src2[i+0];
98 dest[i+1] = src1[i+1] + src2[i+1];
99 dest[i+2] = src1[i+2] + src2[i+2];
100 dest[i+3] = src1[i+3] + src2[i+3];
101 }
102 for(;i<n;i++){
103 dest[i] = src1[i] + src2[i];
104 }
105 }
106 OIL_DEFINE_IMPL (add_f32_unroll4b, add_f32);
107
108 static void
add_f32_unroll4c(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)109 add_f32_unroll4c (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
110 {
111 int i;
112
113 for(i=0;i<(n&(~0x3));i+=4){
114 *dest++ = *src1++ + *src2++;
115 *dest++ = *src1++ + *src2++;
116 *dest++ = *src1++ + *src2++;
117 *dest++ = *src1++ + *src2++;
118 }
119 for(;i<n;i++){
120 *dest++ = *src1++ + *src2++;
121 }
122 }
123 OIL_DEFINE_IMPL (add_f32_unroll4c, add_f32);
124
125 static void
add_f64_pointer(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)126 add_f64_pointer (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
127 {
128 while (n) {
129 *dest = *src1 + *src2;
130 dest++;
131 src1++;
132 src2++;
133 n--;
134 }
135 }
136 OIL_DEFINE_IMPL (add_f64_pointer, add_f64);
137
138 static void
add_f64_unroll2(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)139 add_f64_unroll2 (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
140 {
141 int i;
142
143 if (n & 1) {
144 dest[0] = src1[0] + src2[0];
145 dest++;
146 src1++;
147 src2++;
148 n--;
149 }
150 for(i=0;i<n;i+=2){
151 dest[i] = src1[i] + src2[i];
152 dest[i+1] = src1[i+1] + src2[i+1];
153 }
154 }
155 OIL_DEFINE_IMPL (add_f64_unroll2, add_f64);
156
157 static void
add_f64_unroll4a(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)158 add_f64_unroll4a (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
159 {
160 int i;
161
162 while (n & 3) {
163 dest[0] = src1[0] + src2[0];
164 dest++;
165 src1++;
166 src2++;
167 n--;
168 }
169 for(i=0;i<n;i+=4){
170 dest[i] = src1[i] + src2[i];
171 dest[i+1] = src1[i+1] + src2[i+1];
172 dest[i+2] = src1[i+2] + src2[i+2];
173 dest[i+3] = src1[i+3] + src2[i+3];
174 }
175 }
176 OIL_DEFINE_IMPL (add_f64_unroll4a, add_f64);
177
178 static void
add_f64_unroll4b(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)179 add_f64_unroll4b (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
180 {
181 int i;
182
183 for(i=0;i<(n&(~0x3));i+=4){
184 dest[i+0] = src1[i+0] + src2[i+0];
185 dest[i+1] = src1[i+1] + src2[i+1];
186 dest[i+2] = src1[i+2] + src2[i+2];
187 dest[i+3] = src1[i+3] + src2[i+3];
188 }
189 for(;i<n;i++){
190 dest[i] = src1[i] + src2[i];
191 }
192 }
193 OIL_DEFINE_IMPL (add_f64_unroll4b, add_f64);
194
195 static void
add_f64_unroll4c(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)196 add_f64_unroll4c (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
197 {
198 int i;
199
200 for(i=0;i<(n&(~0x3));i+=4){
201 *dest++ = *src1++ + *src2++;
202 *dest++ = *src1++ + *src2++;
203 *dest++ = *src1++ + *src2++;
204 *dest++ = *src1++ + *src2++;
205 }
206 for(;i<n;i++){
207 *dest++ = *src1++ + *src2++;
208 }
209 }
210 OIL_DEFINE_IMPL (add_f64_unroll4c, add_f64);
211
212 static void
subtract_f32_pointer(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)213 subtract_f32_pointer (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
214 {
215 while (n) {
216 *dest = *src1 - *src2;
217 dest++;
218 src1++;
219 src2++;
220 n--;
221 }
222 }
223 OIL_DEFINE_IMPL (subtract_f32_pointer, subtract_f32);
224
225 static void
subtract_f32_unroll2(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)226 subtract_f32_unroll2 (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
227 {
228 int i;
229
230 if (n & 1) {
231 dest[0] = src1[0] - src2[0];
232 dest++;
233 src1++;
234 src2++;
235 n--;
236 }
237 for(i=0;i<n;i+=2){
238 dest[i] = src1[i] - src2[i];
239 dest[i+1] = src1[i+1] - src2[i+1];
240 }
241 }
242 OIL_DEFINE_IMPL (subtract_f32_unroll2, subtract_f32);
243
244 static void
subtract_f32_unroll4a(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)245 subtract_f32_unroll4a (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
246 {
247 int i;
248
249 while (n & 3) {
250 dest[0] = src1[0] - src2[0];
251 dest++;
252 src1++;
253 src2++;
254 n--;
255 }
256 for(i=0;i<n;i+=4){
257 dest[i] = src1[i] - src2[i];
258 dest[i+1] = src1[i+1] - src2[i+1];
259 dest[i+2] = src1[i+2] - src2[i+2];
260 dest[i+3] = src1[i+3] - src2[i+3];
261 }
262 }
263 OIL_DEFINE_IMPL (subtract_f32_unroll4a, subtract_f32);
264
265 static void
subtract_f32_unroll4b(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)266 subtract_f32_unroll4b (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
267 {
268 int i;
269
270 for(i=0;i<(n&(~0x3));i+=4){
271 dest[i+0] = src1[i+0] - src2[i+0];
272 dest[i+1] = src1[i+1] - src2[i+1];
273 dest[i+2] = src1[i+2] - src2[i+2];
274 dest[i+3] = src1[i+3] - src2[i+3];
275 }
276 for(;i<n;i++){
277 dest[i] = src1[i] - src2[i];
278 }
279 }
280 OIL_DEFINE_IMPL (subtract_f32_unroll4b, subtract_f32);
281
282 static void
subtract_f32_unroll4c(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)283 subtract_f32_unroll4c (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
284 {
285 int i;
286
287 for(i=0;i<(n&(~0x3));i+=4){
288 *dest++ = *src1++ - *src2++;
289 *dest++ = *src1++ - *src2++;
290 *dest++ = *src1++ - *src2++;
291 *dest++ = *src1++ - *src2++;
292 }
293 for(;i<n;i++){
294 *dest++ = *src1++ - *src2++;
295 }
296 }
297 OIL_DEFINE_IMPL (subtract_f32_unroll4c, subtract_f32);
298
299 static void
subtract_f64_pointer(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)300 subtract_f64_pointer (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
301 {
302 while (n) {
303 *dest = *src1 - *src2;
304 dest++;
305 src1++;
306 src2++;
307 n--;
308 }
309 }
310 OIL_DEFINE_IMPL (subtract_f64_pointer, subtract_f64);
311
312 static void
subtract_f64_unroll2(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)313 subtract_f64_unroll2 (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
314 {
315 int i;
316
317 if (n & 1) {
318 dest[0] = src1[0] - src2[0];
319 dest++;
320 src1++;
321 src2++;
322 n--;
323 }
324 for(i=0;i<n;i+=2){
325 dest[i] = src1[i] - src2[i];
326 dest[i+1] = src1[i+1] - src2[i+1];
327 }
328 }
329 OIL_DEFINE_IMPL (subtract_f64_unroll2, subtract_f64);
330
331 static void
subtract_f64_unroll4a(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)332 subtract_f64_unroll4a (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
333 {
334 int i;
335
336 while (n & 3) {
337 dest[0] = src1[0] - src2[0];
338 dest++;
339 src1++;
340 src2++;
341 n--;
342 }
343 for(i=0;i<n;i+=4){
344 dest[i] = src1[i] - src2[i];
345 dest[i+1] = src1[i+1] - src2[i+1];
346 dest[i+2] = src1[i+2] - src2[i+2];
347 dest[i+3] = src1[i+3] - src2[i+3];
348 }
349 }
350 OIL_DEFINE_IMPL (subtract_f64_unroll4a, subtract_f64);
351
352 static void
subtract_f64_unroll4b(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)353 subtract_f64_unroll4b (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
354 {
355 int i;
356
357 for(i=0;i<(n&(~0x3));i+=4){
358 dest[i+0] = src1[i+0] - src2[i+0];
359 dest[i+1] = src1[i+1] - src2[i+1];
360 dest[i+2] = src1[i+2] - src2[i+2];
361 dest[i+3] = src1[i+3] - src2[i+3];
362 }
363 for(;i<n;i++){
364 dest[i] = src1[i] - src2[i];
365 }
366 }
367 OIL_DEFINE_IMPL (subtract_f64_unroll4b, subtract_f64);
368
369 static void
subtract_f64_unroll4c(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)370 subtract_f64_unroll4c (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
371 {
372 int i;
373
374 for(i=0;i<(n&(~0x3));i+=4){
375 *dest++ = *src1++ - *src2++;
376 *dest++ = *src1++ - *src2++;
377 *dest++ = *src1++ - *src2++;
378 *dest++ = *src1++ - *src2++;
379 }
380 for(;i<n;i++){
381 *dest++ = *src1++ - *src2++;
382 }
383 }
384 OIL_DEFINE_IMPL (subtract_f64_unroll4c, subtract_f64);
385
386 static void
divide_f32_pointer(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)387 divide_f32_pointer (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
388 {
389 while (n) {
390 *dest = *src1 / *src2;
391 dest++;
392 src1++;
393 src2++;
394 n--;
395 }
396 }
397 OIL_DEFINE_IMPL (divide_f32_pointer, divide_f32);
398
399 static void
divide_f32_unroll2(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)400 divide_f32_unroll2 (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
401 {
402 int i;
403
404 if (n & 1) {
405 dest[0] = src1[0] / src2[0];
406 dest++;
407 src1++;
408 src2++;
409 n--;
410 }
411 for(i=0;i<n;i+=2){
412 dest[i] = src1[i] / src2[i];
413 dest[i+1] = src1[i+1] / src2[i+1];
414 }
415 }
416 OIL_DEFINE_IMPL (divide_f32_unroll2, divide_f32);
417
418 static void
divide_f32_unroll4a(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)419 divide_f32_unroll4a (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
420 {
421 int i;
422
423 while (n & 3) {
424 dest[0] = src1[0] / src2[0];
425 dest++;
426 src1++;
427 src2++;
428 n--;
429 }
430 for(i=0;i<n;i+=4){
431 dest[i] = src1[i] / src2[i];
432 dest[i+1] = src1[i+1] / src2[i+1];
433 dest[i+2] = src1[i+2] / src2[i+2];
434 dest[i+3] = src1[i+3] / src2[i+3];
435 }
436 }
437 OIL_DEFINE_IMPL (divide_f32_unroll4a, divide_f32);
438
439 static void
divide_f32_unroll4b(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)440 divide_f32_unroll4b (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
441 {
442 int i;
443
444 for(i=0;i<(n&(~0x3));i+=4){
445 dest[i+0] = src1[i+0] / src2[i+0];
446 dest[i+1] = src1[i+1] / src2[i+1];
447 dest[i+2] = src1[i+2] / src2[i+2];
448 dest[i+3] = src1[i+3] / src2[i+3];
449 }
450 for(;i<n;i++){
451 dest[i] = src1[i] / src2[i];
452 }
453 }
454 OIL_DEFINE_IMPL (divide_f32_unroll4b, divide_f32);
455
456 static void
divide_f32_unroll4c(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)457 divide_f32_unroll4c (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
458 {
459 int i;
460
461 for(i=0;i<(n&(~0x3));i+=4){
462 *dest++ = *src1++ / *src2++;
463 *dest++ = *src1++ / *src2++;
464 *dest++ = *src1++ / *src2++;
465 *dest++ = *src1++ / *src2++;
466 }
467 for(;i<n;i++){
468 *dest++ = *src1++ / *src2++;
469 }
470 }
471 OIL_DEFINE_IMPL (divide_f32_unroll4c, divide_f32);
472
473 static void
divide_f64_pointer(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)474 divide_f64_pointer (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
475 {
476 while (n) {
477 *dest = *src1 / *src2;
478 dest++;
479 src1++;
480 src2++;
481 n--;
482 }
483 }
484 OIL_DEFINE_IMPL (divide_f64_pointer, divide_f64);
485
486 static void
divide_f64_unroll2(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)487 divide_f64_unroll2 (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
488 {
489 int i;
490
491 if (n & 1) {
492 dest[0] = src1[0] / src2[0];
493 dest++;
494 src1++;
495 src2++;
496 n--;
497 }
498 for(i=0;i<n;i+=2){
499 dest[i] = src1[i] / src2[i];
500 dest[i+1] = src1[i+1] / src2[i+1];
501 }
502 }
503 OIL_DEFINE_IMPL (divide_f64_unroll2, divide_f64);
504
505 static void
divide_f64_unroll4a(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)506 divide_f64_unroll4a (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
507 {
508 int i;
509
510 while (n & 3) {
511 dest[0] = src1[0] / src2[0];
512 dest++;
513 src1++;
514 src2++;
515 n--;
516 }
517 for(i=0;i<n;i+=4){
518 dest[i] = src1[i] / src2[i];
519 dest[i+1] = src1[i+1] / src2[i+1];
520 dest[i+2] = src1[i+2] / src2[i+2];
521 dest[i+3] = src1[i+3] / src2[i+3];
522 }
523 }
524 OIL_DEFINE_IMPL (divide_f64_unroll4a, divide_f64);
525
526 static void
divide_f64_unroll4b(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)527 divide_f64_unroll4b (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
528 {
529 int i;
530
531 for(i=0;i<(n&(~0x3));i+=4){
532 dest[i+0] = src1[i+0] / src2[i+0];
533 dest[i+1] = src1[i+1] / src2[i+1];
534 dest[i+2] = src1[i+2] / src2[i+2];
535 dest[i+3] = src1[i+3] / src2[i+3];
536 }
537 for(;i<n;i++){
538 dest[i] = src1[i] / src2[i];
539 }
540 }
541 OIL_DEFINE_IMPL (divide_f64_unroll4b, divide_f64);
542
543 static void
divide_f64_unroll4c(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)544 divide_f64_unroll4c (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
545 {
546 int i;
547
548 for(i=0;i<(n&(~0x3));i+=4){
549 *dest++ = *src1++ / *src2++;
550 *dest++ = *src1++ / *src2++;
551 *dest++ = *src1++ / *src2++;
552 *dest++ = *src1++ / *src2++;
553 }
554 for(;i<n;i++){
555 *dest++ = *src1++ / *src2++;
556 }
557 }
558 OIL_DEFINE_IMPL (divide_f64_unroll4c, divide_f64);
559
560 static void
multiply_f32_pointer(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)561 multiply_f32_pointer (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
562 {
563 while (n) {
564 *dest = *src1 * *src2;
565 dest++;
566 src1++;
567 src2++;
568 n--;
569 }
570 }
571 OIL_DEFINE_IMPL (multiply_f32_pointer, multiply_f32);
572
573 static void
multiply_f32_unroll2(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)574 multiply_f32_unroll2 (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
575 {
576 int i;
577
578 if (n & 1) {
579 dest[0] = src1[0] * src2[0];
580 dest++;
581 src1++;
582 src2++;
583 n--;
584 }
585 for(i=0;i<n;i+=2){
586 dest[i] = src1[i] * src2[i];
587 dest[i+1] = src1[i+1] * src2[i+1];
588 }
589 }
590 OIL_DEFINE_IMPL (multiply_f32_unroll2, multiply_f32);
591
592 static void
multiply_f32_unroll4a(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)593 multiply_f32_unroll4a (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
594 {
595 int i;
596
597 while (n & 3) {
598 dest[0] = src1[0] * src2[0];
599 dest++;
600 src1++;
601 src2++;
602 n--;
603 }
604 for(i=0;i<n;i+=4){
605 dest[i] = src1[i] * src2[i];
606 dest[i+1] = src1[i+1] * src2[i+1];
607 dest[i+2] = src1[i+2] * src2[i+2];
608 dest[i+3] = src1[i+3] * src2[i+3];
609 }
610 }
611 OIL_DEFINE_IMPL (multiply_f32_unroll4a, multiply_f32);
612
613 static void
multiply_f32_unroll4b(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)614 multiply_f32_unroll4b (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
615 {
616 int i;
617
618 for(i=0;i<(n&(~0x3));i+=4){
619 dest[i+0] = src1[i+0] * src2[i+0];
620 dest[i+1] = src1[i+1] * src2[i+1];
621 dest[i+2] = src1[i+2] * src2[i+2];
622 dest[i+3] = src1[i+3] * src2[i+3];
623 }
624 for(;i<n;i++){
625 dest[i] = src1[i] * src2[i];
626 }
627 }
628 OIL_DEFINE_IMPL (multiply_f32_unroll4b, multiply_f32);
629
630 static void
multiply_f32_unroll4c(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)631 multiply_f32_unroll4c (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
632 {
633 int i;
634
635 for(i=0;i<(n&(~0x3));i+=4){
636 *dest++ = *src1++ * *src2++;
637 *dest++ = *src1++ * *src2++;
638 *dest++ = *src1++ * *src2++;
639 *dest++ = *src1++ * *src2++;
640 }
641 for(;i<n;i++){
642 *dest++ = *src1++ * *src2++;
643 }
644 }
645 OIL_DEFINE_IMPL (multiply_f32_unroll4c, multiply_f32);
646
647 static void
multiply_f64_pointer(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)648 multiply_f64_pointer (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
649 {
650 while (n) {
651 *dest = *src1 * *src2;
652 dest++;
653 src1++;
654 src2++;
655 n--;
656 }
657 }
658 OIL_DEFINE_IMPL (multiply_f64_pointer, multiply_f64);
659
660 static void
multiply_f64_unroll2(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)661 multiply_f64_unroll2 (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
662 {
663 int i;
664
665 if (n & 1) {
666 dest[0] = src1[0] * src2[0];
667 dest++;
668 src1++;
669 src2++;
670 n--;
671 }
672 for(i=0;i<n;i+=2){
673 dest[i] = src1[i] * src2[i];
674 dest[i+1] = src1[i+1] * src2[i+1];
675 }
676 }
677 OIL_DEFINE_IMPL (multiply_f64_unroll2, multiply_f64);
678
679 static void
multiply_f64_unroll4a(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)680 multiply_f64_unroll4a (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
681 {
682 int i;
683
684 while (n & 3) {
685 dest[0] = src1[0] * src2[0];
686 dest++;
687 src1++;
688 src2++;
689 n--;
690 }
691 for(i=0;i<n;i+=4){
692 dest[i] = src1[i] * src2[i];
693 dest[i+1] = src1[i+1] * src2[i+1];
694 dest[i+2] = src1[i+2] * src2[i+2];
695 dest[i+3] = src1[i+3] * src2[i+3];
696 }
697 }
698 OIL_DEFINE_IMPL (multiply_f64_unroll4a, multiply_f64);
699
700 static void
multiply_f64_unroll4b(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)701 multiply_f64_unroll4b (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
702 {
703 int i;
704
705 for(i=0;i<(n&(~0x3));i+=4){
706 dest[i+0] = src1[i+0] * src2[i+0];
707 dest[i+1] = src1[i+1] * src2[i+1];
708 dest[i+2] = src1[i+2] * src2[i+2];
709 dest[i+3] = src1[i+3] * src2[i+3];
710 }
711 for(;i<n;i++){
712 dest[i] = src1[i] * src2[i];
713 }
714 }
715 OIL_DEFINE_IMPL (multiply_f64_unroll4b, multiply_f64);
716
717 static void
multiply_f64_unroll4c(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)718 multiply_f64_unroll4c (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
719 {
720 int i;
721
722 for(i=0;i<(n&(~0x3));i+=4){
723 *dest++ = *src1++ * *src2++;
724 *dest++ = *src1++ * *src2++;
725 *dest++ = *src1++ * *src2++;
726 *dest++ = *src1++ * *src2++;
727 }
728 for(;i<n;i++){
729 *dest++ = *src1++ * *src2++;
730 }
731 }
732 OIL_DEFINE_IMPL (multiply_f64_unroll4c, multiply_f64);
733
734