1 extern void abort (void);
2
/* Four packed floats (16 bytes), using the GCC vector extension.  */
typedef float v4flt __attribute__ ((vector_size (16)));

/* Column-wise sum of A rows of N floats each:

     dst[j] = src[0][j] + src[1][j] + ... + src[a - 1][j]   for 0 <= j < n

   src[0] is always read, so SRC must hold at least one row; with
   A <= 1 the result is a plain copy of src[0].

   The work is split into three phases:
     1. a scalar loop that runs until dst + j is vector-aligned,
     2. a vector loop handling four vectors (4 * z floats) per iteration,
     3. a scalar loop for whatever is left.

   Only DST's alignment is tested.  The vector loop dereferences
   src[i] + j through v4flt pointers as well, so every row is assumed
   to be vector-aligned at the same indices as DST; this is not checked.  */
void __attribute__ ((noinline))
foo (float *dst, float **src, int a, int n)
{
  int i, j;
  int z = sizeof (v4flt) / sizeof (float);	/* Floats per vector.  */
  unsigned m = sizeof (v4flt) - 1;		/* Vector alignment mask.  */

  /* Phase 1: peel scalar iterations until &dst[j] is vector-aligned.
     The mask must be applied to the address of element J, not to the
     address of DST plus J bytes, otherwise the loop can stop at an
     index whose address is still misaligned.  */
  for (j = 0; j < n && ((unsigned long) (dst + j) & m); ++j)
    {
      float t = src[0][j];
      for (i = 1; i < a; ++i)
	t += src[i][j];
      dst[j] = t;
    }

  /* Phase 2: four vectors per iteration, as long as a full group of
     4 * z elements remains.  */
  for (; j < (n - (4 * z - 1)); j += 4 * z)
    {
      v4flt t0 = *(v4flt *) (src[0] + j + 0 * z);
      v4flt t1 = *(v4flt *) (src[0] + j + 1 * z);
      v4flt t2 = *(v4flt *) (src[0] + j + 2 * z);
      v4flt t3 = *(v4flt *) (src[0] + j + 3 * z);
      for (i = 1; i < a; ++i)
	{
	  t0 += *(v4flt *) (src[i] + j + 0 * z);
	  t1 += *(v4flt *) (src[i] + j + 1 * z);
	  t2 += *(v4flt *) (src[i] + j + 2 * z);
	  t3 += *(v4flt *) (src[i] + j + 3 * z);
	}
      *(v4flt *) (dst + j + 0 * z) = t0;
      *(v4flt *) (dst + j + 1 * z) = t1;
      *(v4flt *) (dst + j + 2 * z) = t2;
      *(v4flt *) (dst + j + 3 * z) = t3;
    }

  /* Phase 3: scalar tail for the remaining elements.  */
  for (; j < n; ++j)
    {
      float t = src[0][j];
      for (i = 1; i < a; ++i)
	t += src[i][j];
      dst[j] = t;
    }
}
45
/* Backing store for the test: three 16-float regions (output plus two
   input rows) and enough slack to round the start up to a multiple of
   16 * sizeof (float) bytes.  */
float buffer[64];

/* Fill two 16-element rows with known values, sum them with foo and
   abort if any element of the result differs from the expected sum.
   Returns 0 on success.  */
int
main (void)
{
  float *rows[2];
  float *out;
  char *base = (char *) buffer;
  int k;

  /* Advance to the next address that is a multiple of
     16 * sizeof (float) bytes (no-op if already there).  */
  base += (-(long int) buffer & (16 * sizeof (float) - 1));

  /* Lay out output, row 0 and row 1 with a 16-float stride.  */
  out = (float *) base;
  rows[0] = out + 16;
  rows[1] = out + 32;

  k = 0;
  while (k < 16)
    {
      rows[0][k] = (float) k + 11 * (float) k;
      rows[1][k] = (float) k + 12 * (float) k;
      ++k;
    }

  foo (out, rows, 2, 16);

  /* Each output element must equal the sum of the two row elements.  */
  k = 0;
  while (k < 16)
    {
      float want = (float) k + 11 * (float) k + (float) k + 12 * (float) k;
      if (out[k] != want)
	abort ();
      ++k;
    }

  return 0;
}
74