1 /* { dg-do compile } */
2 /* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi" } */
3 
/* GCC vector types used by the test.  The vector_size argument is the
   total size in bytes: u32v4 is 16 bytes of unsigned int, u16v16 is
   32 bytes of unsigned short and u8v16 is 16 bytes of unsigned char.
   The lane counts in the names assume 32-bit unsigned int and 16-bit
   unsigned short.  */
4 typedef unsigned int u32v4 __attribute__((vector_size(16)));
5 typedef unsigned short u16v16 __attribute__((vector_size(32)));
6 typedef unsigned char u8v16 __attribute__((vector_size(16)));
7 
/* The same 16 bytes viewed either as a vector of unsigned char or as a
   vector of unsigned int.  */
8 union vec128 {
9   u8v16 u8;
10   u32v4 u32;
11 };
12 
/* Route memcpy straight to the compiler builtin; the file includes no
   header that would declare it.  */
13 #define memcpy __builtin_memcpy
14 
/* Widen X to a vector of unsigned short, element by element.  Each of
   the 16 unsigned char elements is converted to unsigned short in the
   same position, which zero-extends it.  The result is written as an
   explicit 16-element vector constructor.  */
zxt(u8v16 x)15 static u16v16 zxt(u8v16 x)
16 {
17   return (u16v16) {
18     x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
19     x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
20   };
21 }
22 
/* Narrow X to a vector of unsigned char, element by element.  Each of
   the 16 unsigned short elements is converted to unsigned char in the
   same position, so only its low-order bits are kept.  The result is
   written as an explicit 16-element vector constructor.  */
narrow(u16v16 x)23 static u8v16 narrow(u16v16 x)
24 {
25   return (u8v16) {
26     x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
27     x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
28   };
29 }
30 
/* Process the bytes at SRC in 16-byte groups and store the results at
   DST.

   For every input byte s the loop stores the low 8 bits of
   (s * a + (k << 8) + 128) >> 8, evaluated in unsigned short lanes.
   Here a is the byte derived from C >> 24 below, and k is the byte of C
   found at the same position in the byte view of a vector whose
   unsigned int lanes all hold C.

   DST and SRC advance by 16 bytes per iteration while N drops by 4, so
   N apparently counts 4-byte units (looks like pixels; nothing visible
   here confirms that).
   NOTE(review): the loop ends only when N reaches exactly 0.  N is
   unsigned, so a value that is not a multiple of 4 wraps around rather
   than terminating.  */
f(char * dst,char * src,unsigned long n,unsigned c)31 void f(char *dst, char *src, unsigned long n, unsigned c)
32 {
  /* ia is 255 minus C >> 24; the second statement adds 1 when ia is at
     least 128.  With a 32-bit unsigned int the range is 0..256.  */
33   unsigned ia = 255 - (c >> 24);
34   ia += ia >> 7;
35 
  /* Adding a scalar to a zeroed vector broadcasts it to every lane.
     NOTE(review): ia == 256 (reached when C >> 24 is 0) becomes 0 in
     the unsigned char cast.  */
36   union vec128 c4 = {0}, ia16 = {0};
37   c4.u32 += c;
38   ia16.u8 += (unsigned char)ia;
39 
  /* Loop-invariant addend: each byte of c4 widened, moved into the high
     half of its unsigned short lane, plus 128 (presumably a rounding
     bias for the >> 8 in the loop).  */
40   u16v16 c16 = (zxt(c4.u8) << 8) + 128;
41 
42   for (; n; src += 16, dst += 16, n -= 4) {
43     union vec128 s;
    /* Load and store through memcpy, one whole union at a time.  */
44     memcpy(&s, src, sizeof s);
    /* Widen, multiply-add in unsigned short lanes, then shift right by
       8 and narrow back to bytes.  */
45     s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8);
46     memcpy(dst, &s, sizeof s);
47   }
48 }
49 
50 /* { dg-final { scan-tree-dump-times "vec_unpack_lo" 3 "optimized" } } */
51 /* After later optimizations we still miss an opportunity to combine a
52    uniform CTOR with a vec_unpack_lo_expr into a CTOR on a converted
53    element, so the lower count checked next is expected to fail.  */
54 /* { dg-final { scan-tree-dump-times "vec_unpack_lo" 2 "optimized" { xfail *-*-* } } } */
55 /* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */
56 /* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */
57