; Test loads of byte-swapped vector elements.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s

; Declarations of the bswap intrinsics exercised below.
declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)

; Test v8i16 loads.
define <8 x i16> @f1(<8 x i16> *%ptr) {
; CHECK-LABEL: f1:
; CHECK: vlbrh %v24, 0(%r2)
; CHECK: br %r14
  %load = load <8 x i16>, <8 x i16> *%ptr
  %ret = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %load)
  ret <8 x i16> %ret
}

; Test v4i32 loads.
define <4 x i32> @f2(<4 x i32> *%ptr) {
; CHECK-LABEL: f2:
; CHECK: vlbrf %v24, 0(%r2)
; CHECK: br %r14
  %load = load <4 x i32>, <4 x i32> *%ptr
  %ret = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %load)
  ret <4 x i32> %ret
}

; Test v2i64 loads.
define <2 x i64> @f3(<2 x i64> *%ptr) {
; CHECK-LABEL: f3:
; CHECK: vlbrg %v24, 0(%r2)
; CHECK: br %r14
  %load = load <2 x i64>, <2 x i64> *%ptr
  %ret = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %load)
  ret <2 x i64> %ret
}

; Test the highest aligned in-range offset.
define <4 x i32> @f4(<4 x i32> *%base) {
; CHECK-LABEL: f4:
; CHECK: vlbrf %v24, 4080(%r2)
; CHECK: br %r14
  %ptr = getelementptr <4 x i32>, <4 x i32> *%base, i64 255
  %load = load <4 x i32>, <4 x i32> *%ptr
  %ret = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %load)
  ret <4 x i32> %ret
}

; Test the highest unaligned in-range offset.
define <4 x i32> @f5(i8 *%base) {
; CHECK-LABEL: f5:
; CHECK: vlbrf %v24, 4095(%r2)
; CHECK: br %r14
  %addr = getelementptr i8, i8 *%base, i64 4095
  %ptr = bitcast i8 *%addr to <4 x i32> *
  %load = load <4 x i32>, <4 x i32> *%ptr
  %ret = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %load)
  ret <4 x i32> %ret
}

; Test the next offset up, which requires separate address logic.
define <4 x i32> @f6(<4 x i32> *%base) {
; CHECK-LABEL: f6:
; CHECK: aghi %r2, 4096
; CHECK: vlbrf %v24, 0(%r2)
; CHECK: br %r14
  %ptr = getelementptr <4 x i32>, <4 x i32> *%base, i64 256
  %load = load <4 x i32>, <4 x i32> *%ptr
  %ret = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %load)
  ret <4 x i32> %ret
}

; Test negative offsets, which also require separate address logic.
define <4 x i32> @f7(<4 x i32> *%base) {
; CHECK-LABEL: f7:
; CHECK: aghi %r2, -16
; CHECK: vlbrf %v24, 0(%r2)
; CHECK: br %r14
  %ptr = getelementptr <4 x i32>, <4 x i32> *%base, i64 -1
  %load = load <4 x i32>, <4 x i32> *%ptr
  %ret = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %load)
  ret <4 x i32> %ret
}

; Check that indexes are allowed.
define <4 x i32> @f8(i8 *%base, i64 %index) {
; CHECK-LABEL: f8:
; CHECK: vlbrf %v24, 0(%r3,%r2)
; CHECK: br %r14
  %addr = getelementptr i8, i8 *%base, i64 %index
  %ptr = bitcast i8 *%addr to <4 x i32> *
  %load = load <4 x i32>, <4 x i32> *%ptr
  %ret = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %load)
  ret <4 x i32> %ret
}