1 /* { dg-do compile } */
2 /* { dg-options "-O2 -msse2" } */
3 
4 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
5 typedef float __m128 __attribute__ ((__vector_size__ (16)));
6 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
7 
8 static __inline __m128
_mm_cmpeq_ps(__m128 __A,__m128 __B)9 _mm_cmpeq_ps (__m128 __A, __m128 __B)
10 {
11   return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
12 }
13 
14 static __inline __m128
_mm_setr_ps(float __Z,float __Y,float __X,float __W)15 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
16 {
17   return __extension__ (__m128)(__v4sf){__Z, __Y, __X, __W };
18 }
19 
20 static __inline __m128
_mm_and_si128(__m128 __A,__m128 __B)21 _mm_and_si128 (__m128 __A, __m128 __B)
22 {
23   return (__m128)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
24 }
25 
26 static __inline __m128
_mm_or_si128(__m128 __A,__m128 __B)27 _mm_or_si128 (__m128 __A, __m128 __B)
28 {
29   return (__m128)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
30 }
31 
32 typedef union
33 {
34   __m128 xmmi;
35   int si[4];
36 }
37 __attribute__ ((aligned (16))) um128;
38 
39 um128 u;
40 
41 static inline int
sse_max_abs_indexf(float * v,int step,int n)42 sse_max_abs_indexf (float *v, int step, int n)
43 {
44   __m128 m1, mm;
45   __m128 mim, mi, msk;
46   um128 u, ui;
47   int n4, step2, step3;
48   mm = __builtin_ia32_andps ((__m128) (__v4sf)
49 			     { 0.0, v[step], v[step2], v[step3] }
50 			     , u.xmmi);
51   if (n4)
52     {
53       int i;
54       for (i = 0; i < n4; ++i);
55       msk = (__m128) _mm_cmpeq_ps (m1, mm);
56       mim = _mm_or_si128 (_mm_and_si128 (msk, mi), mim);
57     }
58   ui.xmmi = (__m128) mim;
59   return ui.si[n];
60 }
61 
62 static void
sse_swap_rowf(float * r1,float * r2,int n)63 sse_swap_rowf (float *r1, float *r2, int n)
64 {
65   int n4 = (n / 4) * 4;
66   float *r14end = r1 + n4;
67   while (r1 < r14end)
68     {
69       *r1 = *r2;
70       r1++;
71     }
72 }
73 
74 void swap_index (int *, int, int);
75 void sse_add_rowf (float *, float *, float, int);
76 
77 void
ludcompf(float * m,int nw,int * prow,int n)78 ludcompf (float *m, int nw, int *prow, int n)
79 {
80   int i, s = 0;
81   float *pm;
82   for (i = 0, pm = m; i < n - 1; ++i, pm += nw)
83     {
84       int vi = sse_max_abs_indexf (pm + i, nw, n - i);
85       float *pt;
86       int j;
87       if (vi != 0)
88 	{
89 	  sse_swap_rowf (pm, pm + vi * nw, nw);
90 	  swap_index (prow, i, i + vi);
91 	}
92       for (j = i + 1, pt = pm + nw; j < n; ++j, pt += nw)
93 	sse_add_rowf (pt + i + 1, pm + i + 1, -1.0, n - i - 1);
94     }
95 }
96