1 /* This file is autogenerated.  Do not edit. */
2 /*
3  * LIBOIL - Library of Optimized Inner Loops
4  * Copyright (c) 2005 David A. Schleef <ds.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #ifdef HAVE_CONFIG_H
30 #include "config.h"
31 #endif
32 
33 #include <math.h>
34 
35 #include <liboil/liboil.h>
36 #include <liboil/liboilclasses.h>
37 
38 static void
add_f32_pointer(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)39 add_f32_pointer (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
40 {
41   while (n) {
42     *dest = *src1 + *src2;
43     dest++;
44     src1++;
45     src2++;
46     n--;
47   }
48 }
49 OIL_DEFINE_IMPL (add_f32_pointer, add_f32);
50 
51 static void
add_f32_unroll2(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)52 add_f32_unroll2 (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
53 {
54   int i;
55 
56   if (n & 1) {
57     dest[0] = src1[0] + src2[0];
58     dest++;
59     src1++;
60     src2++;
61     n--;
62   }
63   for(i=0;i<n;i+=2){
64     dest[i] = src1[i] + src2[i];
65     dest[i+1] = src1[i+1] + src2[i+1];
66   }
67 }
68 OIL_DEFINE_IMPL (add_f32_unroll2, add_f32);
69 
70 static void
add_f32_unroll4a(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)71 add_f32_unroll4a (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
72 {
73   int i;
74 
75   while (n & 3) {
76     dest[0] = src1[0] + src2[0];
77     dest++;
78     src1++;
79     src2++;
80     n--;
81   }
82   for(i=0;i<n;i+=4){
83     dest[i] = src1[i] + src2[i];
84     dest[i+1] = src1[i+1] + src2[i+1];
85     dest[i+2] = src1[i+2] + src2[i+2];
86     dest[i+3] = src1[i+3] + src2[i+3];
87   }
88 }
89 OIL_DEFINE_IMPL (add_f32_unroll4a, add_f32);
90 
91 static void
add_f32_unroll4b(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)92 add_f32_unroll4b (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
93 {
94   int i;
95 
96   for(i=0;i<(n&(~0x3));i+=4){
97     dest[i+0] = src1[i+0] + src2[i+0];
98     dest[i+1] = src1[i+1] + src2[i+1];
99     dest[i+2] = src1[i+2] + src2[i+2];
100     dest[i+3] = src1[i+3] + src2[i+3];
101   }
102   for(;i<n;i++){
103     dest[i] = src1[i] + src2[i];
104   }
105 }
106 OIL_DEFINE_IMPL (add_f32_unroll4b, add_f32);
107 
108 static void
add_f32_unroll4c(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)109 add_f32_unroll4c (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
110 {
111   int i;
112 
113   for(i=0;i<(n&(~0x3));i+=4){
114     *dest++ = *src1++ + *src2++;
115     *dest++ = *src1++ + *src2++;
116     *dest++ = *src1++ + *src2++;
117     *dest++ = *src1++ + *src2++;
118   }
119   for(;i<n;i++){
120     *dest++ = *src1++ + *src2++;
121   }
122 }
123 OIL_DEFINE_IMPL (add_f32_unroll4c, add_f32);
124 
125 static void
add_f64_pointer(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)126 add_f64_pointer (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
127 {
128   while (n) {
129     *dest = *src1 + *src2;
130     dest++;
131     src1++;
132     src2++;
133     n--;
134   }
135 }
136 OIL_DEFINE_IMPL (add_f64_pointer, add_f64);
137 
138 static void
add_f64_unroll2(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)139 add_f64_unroll2 (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
140 {
141   int i;
142 
143   if (n & 1) {
144     dest[0] = src1[0] + src2[0];
145     dest++;
146     src1++;
147     src2++;
148     n--;
149   }
150   for(i=0;i<n;i+=2){
151     dest[i] = src1[i] + src2[i];
152     dest[i+1] = src1[i+1] + src2[i+1];
153   }
154 }
155 OIL_DEFINE_IMPL (add_f64_unroll2, add_f64);
156 
157 static void
add_f64_unroll4a(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)158 add_f64_unroll4a (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
159 {
160   int i;
161 
162   while (n & 3) {
163     dest[0] = src1[0] + src2[0];
164     dest++;
165     src1++;
166     src2++;
167     n--;
168   }
169   for(i=0;i<n;i+=4){
170     dest[i] = src1[i] + src2[i];
171     dest[i+1] = src1[i+1] + src2[i+1];
172     dest[i+2] = src1[i+2] + src2[i+2];
173     dest[i+3] = src1[i+3] + src2[i+3];
174   }
175 }
176 OIL_DEFINE_IMPL (add_f64_unroll4a, add_f64);
177 
178 static void
add_f64_unroll4b(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)179 add_f64_unroll4b (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
180 {
181   int i;
182 
183   for(i=0;i<(n&(~0x3));i+=4){
184     dest[i+0] = src1[i+0] + src2[i+0];
185     dest[i+1] = src1[i+1] + src2[i+1];
186     dest[i+2] = src1[i+2] + src2[i+2];
187     dest[i+3] = src1[i+3] + src2[i+3];
188   }
189   for(;i<n;i++){
190     dest[i] = src1[i] + src2[i];
191   }
192 }
193 OIL_DEFINE_IMPL (add_f64_unroll4b, add_f64);
194 
195 static void
add_f64_unroll4c(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)196 add_f64_unroll4c (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
197 {
198   int i;
199 
200   for(i=0;i<(n&(~0x3));i+=4){
201     *dest++ = *src1++ + *src2++;
202     *dest++ = *src1++ + *src2++;
203     *dest++ = *src1++ + *src2++;
204     *dest++ = *src1++ + *src2++;
205   }
206   for(;i<n;i++){
207     *dest++ = *src1++ + *src2++;
208   }
209 }
210 OIL_DEFINE_IMPL (add_f64_unroll4c, add_f64);
211 
212 static void
subtract_f32_pointer(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)213 subtract_f32_pointer (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
214 {
215   while (n) {
216     *dest = *src1 - *src2;
217     dest++;
218     src1++;
219     src2++;
220     n--;
221   }
222 }
223 OIL_DEFINE_IMPL (subtract_f32_pointer, subtract_f32);
224 
225 static void
subtract_f32_unroll2(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)226 subtract_f32_unroll2 (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
227 {
228   int i;
229 
230   if (n & 1) {
231     dest[0] = src1[0] - src2[0];
232     dest++;
233     src1++;
234     src2++;
235     n--;
236   }
237   for(i=0;i<n;i+=2){
238     dest[i] = src1[i] - src2[i];
239     dest[i+1] = src1[i+1] - src2[i+1];
240   }
241 }
242 OIL_DEFINE_IMPL (subtract_f32_unroll2, subtract_f32);
243 
244 static void
subtract_f32_unroll4a(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)245 subtract_f32_unroll4a (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
246 {
247   int i;
248 
249   while (n & 3) {
250     dest[0] = src1[0] - src2[0];
251     dest++;
252     src1++;
253     src2++;
254     n--;
255   }
256   for(i=0;i<n;i+=4){
257     dest[i] = src1[i] - src2[i];
258     dest[i+1] = src1[i+1] - src2[i+1];
259     dest[i+2] = src1[i+2] - src2[i+2];
260     dest[i+3] = src1[i+3] - src2[i+3];
261   }
262 }
263 OIL_DEFINE_IMPL (subtract_f32_unroll4a, subtract_f32);
264 
265 static void
subtract_f32_unroll4b(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)266 subtract_f32_unroll4b (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
267 {
268   int i;
269 
270   for(i=0;i<(n&(~0x3));i+=4){
271     dest[i+0] = src1[i+0] - src2[i+0];
272     dest[i+1] = src1[i+1] - src2[i+1];
273     dest[i+2] = src1[i+2] - src2[i+2];
274     dest[i+3] = src1[i+3] - src2[i+3];
275   }
276   for(;i<n;i++){
277     dest[i] = src1[i] - src2[i];
278   }
279 }
280 OIL_DEFINE_IMPL (subtract_f32_unroll4b, subtract_f32);
281 
282 static void
subtract_f32_unroll4c(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)283 subtract_f32_unroll4c (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
284 {
285   int i;
286 
287   for(i=0;i<(n&(~0x3));i+=4){
288     *dest++ = *src1++ - *src2++;
289     *dest++ = *src1++ - *src2++;
290     *dest++ = *src1++ - *src2++;
291     *dest++ = *src1++ - *src2++;
292   }
293   for(;i<n;i++){
294     *dest++ = *src1++ - *src2++;
295   }
296 }
297 OIL_DEFINE_IMPL (subtract_f32_unroll4c, subtract_f32);
298 
299 static void
subtract_f64_pointer(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)300 subtract_f64_pointer (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
301 {
302   while (n) {
303     *dest = *src1 - *src2;
304     dest++;
305     src1++;
306     src2++;
307     n--;
308   }
309 }
310 OIL_DEFINE_IMPL (subtract_f64_pointer, subtract_f64);
311 
312 static void
subtract_f64_unroll2(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)313 subtract_f64_unroll2 (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
314 {
315   int i;
316 
317   if (n & 1) {
318     dest[0] = src1[0] - src2[0];
319     dest++;
320     src1++;
321     src2++;
322     n--;
323   }
324   for(i=0;i<n;i+=2){
325     dest[i] = src1[i] - src2[i];
326     dest[i+1] = src1[i+1] - src2[i+1];
327   }
328 }
329 OIL_DEFINE_IMPL (subtract_f64_unroll2, subtract_f64);
330 
331 static void
subtract_f64_unroll4a(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)332 subtract_f64_unroll4a (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
333 {
334   int i;
335 
336   while (n & 3) {
337     dest[0] = src1[0] - src2[0];
338     dest++;
339     src1++;
340     src2++;
341     n--;
342   }
343   for(i=0;i<n;i+=4){
344     dest[i] = src1[i] - src2[i];
345     dest[i+1] = src1[i+1] - src2[i+1];
346     dest[i+2] = src1[i+2] - src2[i+2];
347     dest[i+3] = src1[i+3] - src2[i+3];
348   }
349 }
350 OIL_DEFINE_IMPL (subtract_f64_unroll4a, subtract_f64);
351 
352 static void
subtract_f64_unroll4b(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)353 subtract_f64_unroll4b (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
354 {
355   int i;
356 
357   for(i=0;i<(n&(~0x3));i+=4){
358     dest[i+0] = src1[i+0] - src2[i+0];
359     dest[i+1] = src1[i+1] - src2[i+1];
360     dest[i+2] = src1[i+2] - src2[i+2];
361     dest[i+3] = src1[i+3] - src2[i+3];
362   }
363   for(;i<n;i++){
364     dest[i] = src1[i] - src2[i];
365   }
366 }
367 OIL_DEFINE_IMPL (subtract_f64_unroll4b, subtract_f64);
368 
369 static void
subtract_f64_unroll4c(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)370 subtract_f64_unroll4c (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
371 {
372   int i;
373 
374   for(i=0;i<(n&(~0x3));i+=4){
375     *dest++ = *src1++ - *src2++;
376     *dest++ = *src1++ - *src2++;
377     *dest++ = *src1++ - *src2++;
378     *dest++ = *src1++ - *src2++;
379   }
380   for(;i<n;i++){
381     *dest++ = *src1++ - *src2++;
382   }
383 }
384 OIL_DEFINE_IMPL (subtract_f64_unroll4c, subtract_f64);
385 
386 static void
divide_f32_pointer(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)387 divide_f32_pointer (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
388 {
389   while (n) {
390     *dest = *src1 / *src2;
391     dest++;
392     src1++;
393     src2++;
394     n--;
395   }
396 }
397 OIL_DEFINE_IMPL (divide_f32_pointer, divide_f32);
398 
399 static void
divide_f32_unroll2(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)400 divide_f32_unroll2 (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
401 {
402   int i;
403 
404   if (n & 1) {
405     dest[0] = src1[0] / src2[0];
406     dest++;
407     src1++;
408     src2++;
409     n--;
410   }
411   for(i=0;i<n;i+=2){
412     dest[i] = src1[i] / src2[i];
413     dest[i+1] = src1[i+1] / src2[i+1];
414   }
415 }
416 OIL_DEFINE_IMPL (divide_f32_unroll2, divide_f32);
417 
418 static void
divide_f32_unroll4a(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)419 divide_f32_unroll4a (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
420 {
421   int i;
422 
423   while (n & 3) {
424     dest[0] = src1[0] / src2[0];
425     dest++;
426     src1++;
427     src2++;
428     n--;
429   }
430   for(i=0;i<n;i+=4){
431     dest[i] = src1[i] / src2[i];
432     dest[i+1] = src1[i+1] / src2[i+1];
433     dest[i+2] = src1[i+2] / src2[i+2];
434     dest[i+3] = src1[i+3] / src2[i+3];
435   }
436 }
437 OIL_DEFINE_IMPL (divide_f32_unroll4a, divide_f32);
438 
439 static void
divide_f32_unroll4b(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)440 divide_f32_unroll4b (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
441 {
442   int i;
443 
444   for(i=0;i<(n&(~0x3));i+=4){
445     dest[i+0] = src1[i+0] / src2[i+0];
446     dest[i+1] = src1[i+1] / src2[i+1];
447     dest[i+2] = src1[i+2] / src2[i+2];
448     dest[i+3] = src1[i+3] / src2[i+3];
449   }
450   for(;i<n;i++){
451     dest[i] = src1[i] / src2[i];
452   }
453 }
454 OIL_DEFINE_IMPL (divide_f32_unroll4b, divide_f32);
455 
456 static void
divide_f32_unroll4c(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)457 divide_f32_unroll4c (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
458 {
459   int i;
460 
461   for(i=0;i<(n&(~0x3));i+=4){
462     *dest++ = *src1++ / *src2++;
463     *dest++ = *src1++ / *src2++;
464     *dest++ = *src1++ / *src2++;
465     *dest++ = *src1++ / *src2++;
466   }
467   for(;i<n;i++){
468     *dest++ = *src1++ / *src2++;
469   }
470 }
471 OIL_DEFINE_IMPL (divide_f32_unroll4c, divide_f32);
472 
473 static void
divide_f64_pointer(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)474 divide_f64_pointer (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
475 {
476   while (n) {
477     *dest = *src1 / *src2;
478     dest++;
479     src1++;
480     src2++;
481     n--;
482   }
483 }
484 OIL_DEFINE_IMPL (divide_f64_pointer, divide_f64);
485 
486 static void
divide_f64_unroll2(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)487 divide_f64_unroll2 (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
488 {
489   int i;
490 
491   if (n & 1) {
492     dest[0] = src1[0] / src2[0];
493     dest++;
494     src1++;
495     src2++;
496     n--;
497   }
498   for(i=0;i<n;i+=2){
499     dest[i] = src1[i] / src2[i];
500     dest[i+1] = src1[i+1] / src2[i+1];
501   }
502 }
503 OIL_DEFINE_IMPL (divide_f64_unroll2, divide_f64);
504 
505 static void
divide_f64_unroll4a(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)506 divide_f64_unroll4a (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
507 {
508   int i;
509 
510   while (n & 3) {
511     dest[0] = src1[0] / src2[0];
512     dest++;
513     src1++;
514     src2++;
515     n--;
516   }
517   for(i=0;i<n;i+=4){
518     dest[i] = src1[i] / src2[i];
519     dest[i+1] = src1[i+1] / src2[i+1];
520     dest[i+2] = src1[i+2] / src2[i+2];
521     dest[i+3] = src1[i+3] / src2[i+3];
522   }
523 }
524 OIL_DEFINE_IMPL (divide_f64_unroll4a, divide_f64);
525 
526 static void
divide_f64_unroll4b(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)527 divide_f64_unroll4b (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
528 {
529   int i;
530 
531   for(i=0;i<(n&(~0x3));i+=4){
532     dest[i+0] = src1[i+0] / src2[i+0];
533     dest[i+1] = src1[i+1] / src2[i+1];
534     dest[i+2] = src1[i+2] / src2[i+2];
535     dest[i+3] = src1[i+3] / src2[i+3];
536   }
537   for(;i<n;i++){
538     dest[i] = src1[i] / src2[i];
539   }
540 }
541 OIL_DEFINE_IMPL (divide_f64_unroll4b, divide_f64);
542 
543 static void
divide_f64_unroll4c(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)544 divide_f64_unroll4c (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
545 {
546   int i;
547 
548   for(i=0;i<(n&(~0x3));i+=4){
549     *dest++ = *src1++ / *src2++;
550     *dest++ = *src1++ / *src2++;
551     *dest++ = *src1++ / *src2++;
552     *dest++ = *src1++ / *src2++;
553   }
554   for(;i<n;i++){
555     *dest++ = *src1++ / *src2++;
556   }
557 }
558 OIL_DEFINE_IMPL (divide_f64_unroll4c, divide_f64);
559 
560 static void
multiply_f32_pointer(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)561 multiply_f32_pointer (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
562 {
563   while (n) {
564     *dest = *src1 * *src2;
565     dest++;
566     src1++;
567     src2++;
568     n--;
569   }
570 }
571 OIL_DEFINE_IMPL (multiply_f32_pointer, multiply_f32);
572 
573 static void
multiply_f32_unroll2(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)574 multiply_f32_unroll2 (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
575 {
576   int i;
577 
578   if (n & 1) {
579     dest[0] = src1[0] * src2[0];
580     dest++;
581     src1++;
582     src2++;
583     n--;
584   }
585   for(i=0;i<n;i+=2){
586     dest[i] = src1[i] * src2[i];
587     dest[i+1] = src1[i+1] * src2[i+1];
588   }
589 }
590 OIL_DEFINE_IMPL (multiply_f32_unroll2, multiply_f32);
591 
592 static void
multiply_f32_unroll4a(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)593 multiply_f32_unroll4a (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
594 {
595   int i;
596 
597   while (n & 3) {
598     dest[0] = src1[0] * src2[0];
599     dest++;
600     src1++;
601     src2++;
602     n--;
603   }
604   for(i=0;i<n;i+=4){
605     dest[i] = src1[i] * src2[i];
606     dest[i+1] = src1[i+1] * src2[i+1];
607     dest[i+2] = src1[i+2] * src2[i+2];
608     dest[i+3] = src1[i+3] * src2[i+3];
609   }
610 }
611 OIL_DEFINE_IMPL (multiply_f32_unroll4a, multiply_f32);
612 
613 static void
multiply_f32_unroll4b(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)614 multiply_f32_unroll4b (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
615 {
616   int i;
617 
618   for(i=0;i<(n&(~0x3));i+=4){
619     dest[i+0] = src1[i+0] * src2[i+0];
620     dest[i+1] = src1[i+1] * src2[i+1];
621     dest[i+2] = src1[i+2] * src2[i+2];
622     dest[i+3] = src1[i+3] * src2[i+3];
623   }
624   for(;i<n;i++){
625     dest[i] = src1[i] * src2[i];
626   }
627 }
628 OIL_DEFINE_IMPL (multiply_f32_unroll4b, multiply_f32);
629 
630 static void
multiply_f32_unroll4c(oil_type_f32 * dest,oil_type_f32 * src1,oil_type_f32 * src2,int n)631 multiply_f32_unroll4c (oil_type_f32 *dest, oil_type_f32 *src1, oil_type_f32 *src2, int n)
632 {
633   int i;
634 
635   for(i=0;i<(n&(~0x3));i+=4){
636     *dest++ = *src1++ * *src2++;
637     *dest++ = *src1++ * *src2++;
638     *dest++ = *src1++ * *src2++;
639     *dest++ = *src1++ * *src2++;
640   }
641   for(;i<n;i++){
642     *dest++ = *src1++ * *src2++;
643   }
644 }
645 OIL_DEFINE_IMPL (multiply_f32_unroll4c, multiply_f32);
646 
647 static void
multiply_f64_pointer(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)648 multiply_f64_pointer (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
649 {
650   while (n) {
651     *dest = *src1 * *src2;
652     dest++;
653     src1++;
654     src2++;
655     n--;
656   }
657 }
658 OIL_DEFINE_IMPL (multiply_f64_pointer, multiply_f64);
659 
660 static void
multiply_f64_unroll2(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)661 multiply_f64_unroll2 (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
662 {
663   int i;
664 
665   if (n & 1) {
666     dest[0] = src1[0] * src2[0];
667     dest++;
668     src1++;
669     src2++;
670     n--;
671   }
672   for(i=0;i<n;i+=2){
673     dest[i] = src1[i] * src2[i];
674     dest[i+1] = src1[i+1] * src2[i+1];
675   }
676 }
677 OIL_DEFINE_IMPL (multiply_f64_unroll2, multiply_f64);
678 
679 static void
multiply_f64_unroll4a(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)680 multiply_f64_unroll4a (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
681 {
682   int i;
683 
684   while (n & 3) {
685     dest[0] = src1[0] * src2[0];
686     dest++;
687     src1++;
688     src2++;
689     n--;
690   }
691   for(i=0;i<n;i+=4){
692     dest[i] = src1[i] * src2[i];
693     dest[i+1] = src1[i+1] * src2[i+1];
694     dest[i+2] = src1[i+2] * src2[i+2];
695     dest[i+3] = src1[i+3] * src2[i+3];
696   }
697 }
698 OIL_DEFINE_IMPL (multiply_f64_unroll4a, multiply_f64);
699 
700 static void
multiply_f64_unroll4b(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)701 multiply_f64_unroll4b (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
702 {
703   int i;
704 
705   for(i=0;i<(n&(~0x3));i+=4){
706     dest[i+0] = src1[i+0] * src2[i+0];
707     dest[i+1] = src1[i+1] * src2[i+1];
708     dest[i+2] = src1[i+2] * src2[i+2];
709     dest[i+3] = src1[i+3] * src2[i+3];
710   }
711   for(;i<n;i++){
712     dest[i] = src1[i] * src2[i];
713   }
714 }
715 OIL_DEFINE_IMPL (multiply_f64_unroll4b, multiply_f64);
716 
717 static void
multiply_f64_unroll4c(oil_type_f64 * dest,oil_type_f64 * src1,oil_type_f64 * src2,int n)718 multiply_f64_unroll4c (oil_type_f64 *dest, oil_type_f64 *src1, oil_type_f64 *src2, int n)
719 {
720   int i;
721 
722   for(i=0;i<(n&(~0x3));i+=4){
723     *dest++ = *src1++ * *src2++;
724     *dest++ = *src1++ * *src2++;
725     *dest++ = *src1++ * *src2++;
726     *dest++ = *src1++ * *src2++;
727   }
728   for(;i<n;i++){
729     *dest++ = *src1++ * *src2++;
730   }
731 }
732 OIL_DEFINE_IMPL (multiply_f64_unroll4c, multiply_f64);
733 
734