1declare void @llvm.trap() noreturn nounwind
2
3declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>)
4declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>)
5declare <64 x i32> @llvm.hexagon.V6.vshuffvdd.128B(<32 x i32>, <32 x i32>, i32)
6declare <64 x i32> @llvm.hexagon.V6.vdealvdd.128B(<32 x i32>, <32 x i32>, i32)
7declare <32 x i32> @llvm.hexagon.V6.vasrwhsat.128B(<32 x i32>, <32 x i32>, i32)
8declare <32 x i32> @llvm.hexagon.V6.vsathub.128B(<32 x i32>, <32 x i32>)
9
; Word ("vw") interleave helper: shuffles the two halves of a 64 x i32
; vector pair together with V6.vshuffvdd.
define weak_odr <64 x i32> @halide.hexagon.interleave.vw(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
  ; Split the pair into its two single-vector halves.
  %hi_half = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
  %lo_half = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
  ; Control value -4 is used for the word variant (the vh/vb siblings pass
  ; -2/-1); presumably it selects the element size -- confirm in the HVX PRM.
  %shuffled = tail call <64 x i32> @llvm.hexagon.V6.vshuffvdd.128B(<32 x i32> %hi_half, <32 x i32> %lo_half, i32 -4)
  ret <64 x i32> %shuffled
}
16
; Halfword ("vh") interleave helper: same structure as the word variant, but
; the operand is bitcast to/from <64 x i32> and the control value is -2.
define weak_odr <128 x i16> @halide.hexagon.interleave.vh(<128 x i16> %arg) nounwind uwtable readnone alwaysinline {
  ; The HVX intrinsics used here are declared on i32 vectors, so reinterpret.
  %pair = bitcast <128 x i16> %arg to <64 x i32>
  %hi_half = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %pair)
  %lo_half = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %pair)
  %shuffled = tail call <64 x i32> @llvm.hexagon.V6.vshuffvdd.128B(<32 x i32> %hi_half, <32 x i32> %lo_half, i32 -2)
  ; Reinterpret the shuffled pair back as halfwords.
  %result = bitcast <64 x i32> %shuffled to <128 x i16>
  ret <128 x i16> %result
}
25
; Byte ("vb") interleave helper: same structure as the word variant, but the
; operand is bitcast to/from <64 x i32> and the control value is -1.
define weak_odr <256 x i8> @halide.hexagon.interleave.vb(<256 x i8> %arg) nounwind uwtable readnone alwaysinline {
  ; The HVX intrinsics used here are declared on i32 vectors, so reinterpret.
  %pair = bitcast <256 x i8> %arg to <64 x i32>
  %hi_half = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %pair)
  %lo_half = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %pair)
  %shuffled = tail call <64 x i32> @llvm.hexagon.V6.vshuffvdd.128B(<32 x i32> %hi_half, <32 x i32> %lo_half, i32 -1)
  ; Reinterpret the shuffled pair back as bytes.
  %result = bitcast <64 x i32> %shuffled to <256 x i8>
  ret <256 x i8> %result
}
34
35
36declare <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32>, <32 x i32>)
37
; Word ("vw") deinterleave helper: runs V6.vdealvdd over the two halves of a
; 64 x i32 vector pair.
define weak_odr <64 x i32> @halide.hexagon.deinterleave.vw(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
  ; Split the pair into its two single-vector halves.
  %hi_half = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
  %lo_half = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
  ; Control value -4 is used for the word variant (the vh/vb siblings pass
  ; -2/-1); presumably it selects the element size -- confirm in the HVX PRM.
  %dealt = call <64 x i32> @llvm.hexagon.V6.vdealvdd.128B(<32 x i32> %hi_half, <32 x i32> %lo_half, i32 -4)
  ret <64 x i32> %dealt
}
44
; Halfword ("vh") deinterleave helper: same structure as the word variant,
; but the operand is bitcast to/from <64 x i32> and the control value is -2.
define weak_odr <128 x i16> @halide.hexagon.deinterleave.vh(<128 x i16> %arg) nounwind uwtable readnone alwaysinline {
  ; The HVX intrinsics used here are declared on i32 vectors, so reinterpret.
  %pair = bitcast <128 x i16> %arg to <64 x i32>
  %hi_half = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %pair)
  %lo_half = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %pair)
  %dealt = call <64 x i32> @llvm.hexagon.V6.vdealvdd.128B(<32 x i32> %hi_half, <32 x i32> %lo_half, i32 -2)
  ; Reinterpret the result back as halfwords.
  %result = bitcast <64 x i32> %dealt to <128 x i16>
  ret <128 x i16> %result
}
53
; Byte ("vb") deinterleave helper: same structure as the word variant, but
; the operand is bitcast to/from <64 x i32> and the control value is -1.
define weak_odr <256 x i8> @halide.hexagon.deinterleave.vb(<256 x i8> %arg) nounwind uwtable readnone alwaysinline {
  ; The HVX intrinsics used here are declared on i32 vectors, so reinterpret.
  %pair = bitcast <256 x i8> %arg to <64 x i32>
  %hi_half = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %pair)
  %lo_half = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %pair)
  %dealt = call <64 x i32> @llvm.hexagon.V6.vdealvdd.128B(<32 x i32> %hi_half, <32 x i32> %lo_half, i32 -1)
  ; Reinterpret the result back as bytes.
  %result = bitcast <64 x i32> %dealt to <256 x i8>
  ret <256 x i8> %result
}
62
63declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
64declare i32 @llvm.hexagon.S2.vsplatrb(i32)
65
66
; Duplicate a byte into both bytes of an i16: result = (arg << 8) | arg,
; with arg zero-extended first.
define weak_odr i16 @halide.hexagon.dup2.b(i8 %arg) nounwind uwtable readnone alwaysinline {
  %byte = zext i8 %arg to i16
  ; Copy of the byte positioned in bits 15:8.
  %byte_upper = shl i16 %byte, 8
  %pair = or i16 %byte_upper, %byte
  ret i16 %pair
}
73
; Duplicate a halfword into both halves of an i32: result = (arg << 16) | arg,
; with arg zero-extended first.
define weak_odr i32 @halide.hexagon.dup2.h(i16 %arg) nounwind uwtable readnone alwaysinline {
  %half = zext i16 %arg to i32
  ; Copy of the halfword positioned in bits 31:16.
  %half_upper = shl i32 %half, 16
  %pair = or i32 %half_upper, %half
  ret i32 %pair
}
80
; Produce an i32 from a byte by zero-extending it and handing it to the
; S2.vsplatrb intrinsic (presumably replicating the low byte into all four
; byte lanes -- confirm against the Hexagon PRM).
define weak_odr i32 @halide.hexagon.dup4.b(i8 %arg) nounwind uwtable readnone alwaysinline {
  %byte = zext i8 %arg to i32
  %splat = tail call i32 @llvm.hexagon.S2.vsplatrb(i32 %byte)
  ret i32 %splat
}
86
; Pack two bytes into an i16 (%high in bits 15:8, %low in bits 7:0) and then
; duplicate that i16 into both halves of an i32 via halide.hexagon.dup2.h.
define weak_odr i32 @halide.hexagon.interleave.b.dup2.h(i8 %low, i8 %high) nounwind uwtable readnone alwaysinline {
  %lo_wide = zext i8 %low to i16
  %hi_wide = zext i8 %high to i16
  ; Move the high byte into the upper half of the halfword.
  %hi_shifted = shl i16 %hi_wide, 8
  %packed = or i16 %hi_shifted, %lo_wide
  %dup = call i32 @halide.hexagon.dup2.h(i16 %packed)
  ret i32 %dup
}
95
; Build a <128 x i8> from a scalar byte: form an i32 with dup4.b, pass it to
; V6.lvsplatw, and reinterpret the resulting word vector as bytes.
define weak_odr <128 x i8> @halide.hexagon.splat.b(i8 %arg) nounwind uwtable readnone alwaysinline {
  %word = call i32 @halide.hexagon.dup4.b(i8 %arg)
  %splat_w = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 %word)
  %splat_b = bitcast <32 x i32> %splat_w to <128 x i8>
  ret <128 x i8> %splat_b
}
102
; Build a <64 x i16> from a scalar halfword: form an i32 with dup2.h, pass it
; to V6.lvsplatw, and reinterpret the resulting word vector as halfwords.
define weak_odr <64 x i16> @halide.hexagon.splat.h(i16 %arg) nounwind uwtable readnone alwaysinline {
  %word = call i32 @halide.hexagon.dup2.h(i16 %arg)
  %splat_w = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 %word)
  %splat_h = bitcast <32 x i32> %splat_w to <64 x i16>
  ret <64 x i16> %splat_h
}
109
; Implement various 32-bit multiplications.
111declare <32 x i32> @llvm.hexagon.V6.vaslw.128B(<32 x i32>, i32)
112declare <32 x i32> @llvm.hexagon.V6.vaslw.acc.128B(<32 x i32>, <32 x i32>, i32)
113declare <32 x i32> @llvm.hexagon.V6.vlsrw.128B(<32 x i32>, i32)
114declare <32 x i32> @llvm.hexagon.V6.vmpyieoh.128B(<32 x i32>, <32 x i32>)
115declare <32 x i32> @llvm.hexagon.V6.vmpyiowh.128B(<32 x i32>, <32 x i32>)
116declare <32 x i32> @llvm.hexagon.V6.vmpyiewuh.128B(<32 x i32>, <32 x i32>)
117declare <32 x i32> @llvm.hexagon.V6.vmpyiewuh.acc.128B(<32 x i32>, <32 x i32>, <32 x i32>)
118declare <32 x i32> @llvm.hexagon.V6.vshufeh.128B(<32 x i32>, <32 x i32>)
119declare <32 x i32> @llvm.hexagon.V6.vshufoh.128B(<32 x i32>, <32 x i32>)
120declare <64 x i32> @llvm.hexagon.V6.vmpyuhv.128B(<32 x i32>, <32 x i32>)
121declare <64 x i32> @llvm.hexagon.V6.vmpyuhv.acc.128B(<64 x i32>, <32 x i32>, <32 x i32>)
122
; 32-bit x 32-bit multiply on single vectors, built from two HVX partial
; multiplies: V6.vmpyieoh followed by the accumulating V6.vmpyiewuh.acc.
; (Which halfword lanes each intrinsic consumes is not shown here -- see the
; HVX PRM.)
define weak_odr <32 x i32> @halide.hexagon.mul.vw.vw(<32 x i32> %a, <32 x i32> %b) nounwind uwtable readnone alwaysinline {
  %partial = call <32 x i32> @llvm.hexagon.V6.vmpyieoh.128B(<32 x i32> %a, <32 x i32> %b)
  ; Accumulate the second partial product on top of the first.
  %product = call <32 x i32> @llvm.hexagon.V6.vmpyiewuh.acc.128B(<32 x i32> %partial, <32 x i32> %a, <32 x i32> %b)
  ret <32 x i32> %product
}
128
; Multiply a pair of word vectors (%a) by a vector of halfwords (%b).
; Both halves of %a go through V6.vmpyiowh: the high half against %b as-is,
; the low half against %b shifted left by 16 within each word.
; NOTE(review): presumably vmpyiowh uses the odd (upper) halfword of each word
; of its second operand, so the shift moves the even halfwords into that
; slot -- confirm against the HVX PRM.
define weak_odr <64 x i32> @halide.hexagon.mul.vw.vh(<64 x i32> %a, <64 x i16> %b) nounwind uwtable readnone alwaysinline {
  %b_words = bitcast <64 x i16> %b to <32 x i32>
  %b_shl16 = call <32 x i32> @llvm.hexagon.V6.vaslw.128B(<32 x i32> %b_words, i32 16)
  ; Low half of %a pairs with the shifted copy of %b.
  %a_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %a)
  %prod_lower = call <32 x i32> @llvm.hexagon.V6.vmpyiowh.128B(<32 x i32> %a_lower, <32 x i32> %b_shl16)
  ; High half of %a pairs with the unshifted %b.
  %a_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %a)
  %prod_upper = call <32 x i32> @llvm.hexagon.V6.vmpyiowh.128B(<32 x i32> %a_upper, <32 x i32> %b_words)
  %product = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %prod_upper, <32 x i32> %prod_lower)
  ret <64 x i32> %product
}
139
; Multiply a pair of word vectors (%a) by a vector of halfwords (%b) using
; V6.vmpyiewuh: the low half of %a against %b as-is, the high half against
; %b logically shifted right by 16 within each word.
; NOTE(review): presumably vmpyiewuh uses the even (lower) unsigned halfword
; of each word of its second operand, so the shift moves the odd halfwords
; into that slot -- confirm against the HVX PRM.
define weak_odr <64 x i32> @halide.hexagon.mul.vw.vuh(<64 x i32> %a, <64 x i16> %b) nounwind uwtable readnone alwaysinline {
  %b_words = bitcast <64 x i16> %b to <32 x i32>
  %b_shr16 = call <32 x i32> @llvm.hexagon.V6.vlsrw.128B(<32 x i32> %b_words, i32 16)
  ; Low half of %a pairs with the unshifted %b.
  %a_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %a)
  %prod_lower = call <32 x i32> @llvm.hexagon.V6.vmpyiewuh.128B(<32 x i32> %a_lower, <32 x i32> %b_words)
  ; High half of %a pairs with the shifted copy of %b.
  %a_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %a)
  %prod_upper = call <32 x i32> @llvm.hexagon.V6.vmpyiewuh.128B(<32 x i32> %a_upper, <32 x i32> %b_shr16)
  %product = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %prod_upper, <32 x i32> %prod_lower)
  ret <64 x i32> %product
}
150
151; Do vaslw.acc on double vectors.
; Applies V6.vaslw.acc independently to the low and high halves of the pairs
; %a (accumulator operand) and %l (shifted operand) with the scalar %r, then
; recombines the two single-vector results into a pair.
define private <64 x i32> @vaslw.acc.dv.128B(<64 x i32> %a, <64 x i32> %l, i32 %r) nounwind uwtable readnone alwaysinline {
  ; Pull both operands apart first.
  %acc_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %a)
  %acc_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %a)
  %val_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %l)
  %val_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %l)
  ; Shift-accumulate each half with the same shift amount.
  %out_lower = call <32 x i32> @llvm.hexagon.V6.vaslw.acc.128B(<32 x i32> %acc_lower, <32 x i32> %val_lower, i32 %r)
  %out_upper = call <32 x i32> @llvm.hexagon.V6.vaslw.acc.128B(<32 x i32> %acc_upper, <32 x i32> %val_upper, i32 %r)
  %out = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %out_upper, <32 x i32> %out_lower)
  ret <64 x i32> %out
}
162
; Multiply a pair of word vectors (%a) by a vector of halfwords (%b).
; The halfwords of %a are regrouped with V6.vshufeh / V6.vshufoh, each group
; is multiplied by %b with V6.vmpyuhv, and the two products are merged as
; even_product + (odd_product << 16) via vaslw.acc.dv.128B.
define weak_odr <64 x i32> @halide.hexagon.mul.vuw.vuh(<64 x i32> %a, <64 x i16> %b) nounwind uwtable readnone alwaysinline {
  %b_words = bitcast <64 x i16> %b to <32 x i32>
  %a_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %a)
  %a_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %a)
  ; Even-halfword group of %a and its product with %b.
  %a_even = call <32 x i32> @llvm.hexagon.V6.vshufeh.128B(<32 x i32> %a_upper, <32 x i32> %a_lower)
  %prod_even = call <64 x i32> @llvm.hexagon.V6.vmpyuhv.128B(<32 x i32> %a_even, <32 x i32> %b_words)
  ; Odd-halfword group of %a and its product with %b.
  %a_odd = call <32 x i32> @llvm.hexagon.V6.vshufoh.128B(<32 x i32> %a_upper, <32 x i32> %a_lower)
  %prod_odd = call <64 x i32> @llvm.hexagon.V6.vmpyuhv.128B(<32 x i32> %a_odd, <32 x i32> %b_words)
  ; The odd product is shifted left by 16 and accumulated onto the even one.
  %product = call <64 x i32> @vaslw.acc.dv.128B(<64 x i32> %prod_even, <64 x i32> %prod_odd, i32 16)
  ret <64 x i32> %product
}
174
; Multiply two pairs of word vectors using 16-bit partial products.
; Both operands are regrouped into even/odd halfword vectors. The result is
;   (a_even * b_even) + ((a_odd * b_even + a_even * b_odd) << 16)
; where the multiplies are V6.vmpyuhv(.acc) and the final shift-accumulate is
; vaslw.acc.dv.128B. No a_odd * b_odd term is computed.
define weak_odr <64 x i32> @halide.hexagon.mul.vuw.vuw(<64 x i32> %a, <64 x i32> %b) nounwind uwtable readnone alwaysinline {
  ; Regroup %a into even and odd halfwords.
  %a_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %a)
  %a_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %a)
  %a_even = call <32 x i32> @llvm.hexagon.V6.vshufeh.128B(<32 x i32> %a_upper, <32 x i32> %a_lower)
  %a_odd = call <32 x i32> @llvm.hexagon.V6.vshufoh.128B(<32 x i32> %a_upper, <32 x i32> %a_lower)
  ; Regroup %b the same way.
  %b_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %b)
  %b_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %b)
  %b_even = call <32 x i32> @llvm.hexagon.V6.vshufeh.128B(<32 x i32> %b_upper, <32 x i32> %b_lower)
  %b_odd = call <32 x i32> @llvm.hexagon.V6.vshufoh.128B(<32 x i32> %b_upper, <32 x i32> %b_lower)
  ; Low-order partial product.
  %low_term = call <64 x i32> @llvm.hexagon.V6.vmpyuhv.128B(<32 x i32> %a_even, <32 x i32> %b_even)
  ; Sum of the two cross terms.
  %cross_first = call <64 x i32> @llvm.hexagon.V6.vmpyuhv.128B(<32 x i32> %a_odd, <32 x i32> %b_even)
  %cross_sum = call <64 x i32> @llvm.hexagon.V6.vmpyuhv.acc.128B(<64 x i32> %cross_first, <32 x i32> %a_even, <32 x i32> %b_odd)
  ; Shift the cross terms up by 16 and add them to the low-order term.
  %product = call <64 x i32> @vaslw.acc.dv.128B(<64 x i32> %low_term, <64 x i32> %cross_sum, i32 16)
  ret <64 x i32> %product
}
190
; 32-bit multiplies that keep the high half.
192declare <32 x i32> @llvm.hexagon.V6.vmpyewuh.128B(<32 x i32>, <32 x i32>)
193declare <32 x i32> @llvm.hexagon.V6.vmpyowh.sacc.128B(<32 x i32>, <32 x i32>, <32 x i32>)
194declare <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.sacc.128B(<32 x i32>, <32 x i32>, <32 x i32>)
195declare <32 x i32> @llvm.hexagon.V6.vasrw.128B(<32 x i32>, i32)
196
; High-half 32-bit multiply: V6.vmpyewuh followed by the accumulating
; V6.vmpyowh.sacc, then an arithmetic shift right by 1.
; NOTE(review): the final shift presumably undoes a doubling built into the
; vmpyowh.sacc result -- confirm against the HVX PRM.
define weak_odr <32 x i32> @halide.hexagon.trunc_mpy.vw.vw(<32 x i32> %a, <32 x i32> %b) nounwind uwtable readnone alwaysinline {
  %even_part = call <32 x i32> @llvm.hexagon.V6.vmpyewuh.128B(<32 x i32> %a, <32 x i32> %b)
  %doubled = call <32 x i32> @llvm.hexagon.V6.vmpyowh.sacc.128B(<32 x i32> %even_part, <32 x i32> %a, <32 x i32> %b)
  %halved = call <32 x i32> @llvm.hexagon.V6.vasrw.128B(<32 x i32> %doubled, i32 1)
  ret <32 x i32> %halved
}
203
; Same two-step multiply as trunc_mpy.vw.vw (V6.vmpyewuh, then
; V6.vmpyowh.sacc) but without the trailing shift right by 1.
define weak_odr <32 x i32> @halide.hexagon.trunc_satdw_mpy2.vw.vw(<32 x i32> %a, <32 x i32> %b) nounwind uwtable readnone alwaysinline {
  %even_part = call <32 x i32> @llvm.hexagon.V6.vmpyewuh.128B(<32 x i32> %a, <32 x i32> %b)
  %result = call <32 x i32> @llvm.hexagon.V6.vmpyowh.sacc.128B(<32 x i32> %even_part, <32 x i32> %a, <32 x i32> %b)
  ret <32 x i32> %result
}
209
; Variant of trunc_satdw_mpy2.vw.vw whose second step uses the ".rnd" form
; of the accumulating multiply (V6.vmpyowh.rnd.sacc).
define weak_odr <32 x i32> @halide.hexagon.trunc_satdw_mpy2_rnd.vw.vw(<32 x i32> %a, <32 x i32> %b) nounwind uwtable readnone alwaysinline {
  %even_part = call <32 x i32> @llvm.hexagon.V6.vmpyewuh.128B(<32 x i32> %a, <32 x i32> %b)
  %result = call <32 x i32> @llvm.hexagon.V6.vmpyowh.rnd.sacc.128B(<32 x i32> %even_part, <32 x i32> %a, <32 x i32> %b)
  ret <32 x i32> %result
}
215
; Hexagon is missing shifts for byte-sized operands.
217declare <32 x i32> @llvm.hexagon.V6.vaslh.128B(<32 x i32>, i32)
218declare <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32>, i32)
219declare <32 x i32> @llvm.hexagon.V6.vlsrh.128B(<32 x i32>, i32)
220declare <32 x i32> @llvm.hexagon.V6.vaslhv.128B(<32 x i32>, <32 x i32>)
221declare <32 x i32> @llvm.hexagon.V6.vasrhv.128B(<32 x i32>, <32 x i32>)
222declare <32 x i32> @llvm.hexagon.V6.vlsrhv.128B(<32 x i32>, <32 x i32>)
223declare <64 x i32> @llvm.hexagon.V6.vzb.128B(<32 x i32>)
224declare <64 x i32> @llvm.hexagon.V6.vsb.128B(<32 x i32>)
225declare <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32>, <32 x i32>)
226
; Byte shift-left by a scalar amount, emulated with halfword shifts:
; widen the bytes with V6.vzb (yielding a vector pair), shift each half with
; V6.vaslh, then merge the halves back into one byte vector with V6.vshuffeb.
define weak_odr <128 x i8> @halide.hexagon.shl.vub.b(<128 x i8> %a, i8 %b) nounwind uwtable readnone alwaysinline {
  ; The scalar shift amount is sign-extended to 32 bits.
  %amount = sext i8 %b to i32
  %bytes = bitcast <128 x i8> %a to <32 x i32>
  %wide = call <64 x i32> @llvm.hexagon.V6.vzb.128B(<32 x i32> %bytes)
  %wide_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %wide)
  %wide_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %wide)
  %shifted_upper = call <32 x i32> @llvm.hexagon.V6.vaslh.128B(<32 x i32> %wide_upper, i32 %amount)
  %shifted_lower = call <32 x i32> @llvm.hexagon.V6.vaslh.128B(<32 x i32> %wide_lower, i32 %amount)
  ; Narrow back to bytes (presumably keeps the even byte of each halfword --
  ; confirm against the HVX PRM).
  %narrow = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %shifted_upper, <32 x i32> %shifted_lower)
  %result = bitcast <32 x i32> %narrow to <128 x i8>
  ret <128 x i8> %result
}
239
; Signed byte shift-left by a scalar amount.
define weak_odr <128 x i8> @halide.hexagon.shl.vb.b(<128 x i8> %a, i8 %b) nounwind uwtable readnone alwaysinline {
  ; Left shifts give the same bits for signed and unsigned operands, so
  ; simply forward to the unsigned implementation.
  %shifted = tail call <128 x i8> @halide.hexagon.shl.vub.b(<128 x i8> %a, i8 %b)
  ret <128 x i8> %shifted
}
245
; Unsigned byte shift-right by a scalar amount, emulated with halfword
; shifts: widen with V6.vzb, shift each half with V6.vlsrh, then merge the
; halves back into one byte vector with V6.vshuffeb.
define weak_odr <128 x i8> @halide.hexagon.shr.vub.b(<128 x i8> %a, i8 %b) nounwind uwtable readnone alwaysinline {
  ; The scalar shift amount is sign-extended to 32 bits.
  %amount = sext i8 %b to i32
  %bytes = bitcast <128 x i8> %a to <32 x i32>
  %wide = call <64 x i32> @llvm.hexagon.V6.vzb.128B(<32 x i32> %bytes)
  %wide_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %wide)
  %wide_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %wide)
  %shifted_upper = call <32 x i32> @llvm.hexagon.V6.vlsrh.128B(<32 x i32> %wide_upper, i32 %amount)
  %shifted_lower = call <32 x i32> @llvm.hexagon.V6.vlsrh.128B(<32 x i32> %wide_lower, i32 %amount)
  ; Narrow back to bytes (presumably keeps the even byte of each halfword --
  ; confirm against the HVX PRM).
  %narrow = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %shifted_upper, <32 x i32> %shifted_lower)
  %result = bitcast <32 x i32> %narrow to <128 x i8>
  ret <128 x i8> %result
}
258
; Signed byte shift-right by a scalar amount, emulated with halfword shifts:
; widen with V6.vsb, shift each half with V6.vasrh, then merge the halves
; back into one byte vector with V6.vshuffeb.
define weak_odr <128 x i8> @halide.hexagon.shr.vb.b(<128 x i8> %a, i8 %b) nounwind uwtable readnone alwaysinline {
  ; The scalar shift amount is sign-extended to 32 bits.
  %amount = sext i8 %b to i32
  %bytes = bitcast <128 x i8> %a to <32 x i32>
  %wide = call <64 x i32> @llvm.hexagon.V6.vsb.128B(<32 x i32> %bytes)
  %wide_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %wide)
  %wide_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %wide)
  %shifted_upper = call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %wide_upper, i32 %amount)
  %shifted_lower = call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %wide_lower, i32 %amount)
  ; Narrow back to bytes (presumably keeps the even byte of each halfword --
  ; confirm against the HVX PRM).
  %narrow = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %shifted_upper, <32 x i32> %shifted_lower)
  %result = bitcast <32 x i32> %narrow to <128 x i8>
  ret <128 x i8> %result
}
271
272
273
; Byte shift-left by per-lane amounts, emulated with halfword shifts:
; values are widened with V6.vzb, shift amounts with V6.vsb, each half is
; shifted with V6.vaslhv, and V6.vshuffeb merges the halves back into bytes.
define weak_odr <128 x i8> @halide.hexagon.shl.vub.vb(<128 x i8> %a, <128 x i8> %b) nounwind uwtable readnone alwaysinline {
  ; Widen the values.
  %val_bytes = bitcast <128 x i8> %a to <32 x i32>
  %val_wide = call <64 x i32> @llvm.hexagon.V6.vzb.128B(<32 x i32> %val_bytes)
  %val_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %val_wide)
  %val_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %val_wide)
  ; Widen the shift amounts.
  %amt_bytes = bitcast <128 x i8> %b to <32 x i32>
  %amt_wide = call <64 x i32> @llvm.hexagon.V6.vsb.128B(<32 x i32> %amt_bytes)
  %amt_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %amt_wide)
  %amt_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %amt_wide)
  ; Shift each half, then narrow back to bytes.
  %shifted_upper = call <32 x i32> @llvm.hexagon.V6.vaslhv.128B(<32 x i32> %val_upper, <32 x i32> %amt_upper)
  %shifted_lower = call <32 x i32> @llvm.hexagon.V6.vaslhv.128B(<32 x i32> %val_lower, <32 x i32> %amt_lower)
  %narrow = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %shifted_upper, <32 x i32> %shifted_lower)
  %result = bitcast <32 x i32> %narrow to <128 x i8>
  ret <128 x i8> %result
}
289
; Signed byte shift-left by per-lane amounts.
define weak_odr <128 x i8> @halide.hexagon.shl.vb.vb(<128 x i8> %a, <128 x i8> %b) nounwind uwtable readnone alwaysinline {
  ; Left shifts give the same bits for signed and unsigned operands, so
  ; simply forward to the unsigned implementation.
  %shifted = tail call <128 x i8> @halide.hexagon.shl.vub.vb(<128 x i8> %a, <128 x i8> %b)
  ret <128 x i8> %shifted
}
295
; Unsigned byte shift-right by per-lane amounts, emulated with halfword
; shifts: values are widened with V6.vzb, shift amounts with V6.vsb, each
; half is shifted with V6.vlsrhv, and V6.vshuffeb merges the halves back
; into bytes.
define weak_odr <128 x i8> @halide.hexagon.shr.vub.vb(<128 x i8> %a, <128 x i8> %b) nounwind uwtable readnone alwaysinline {
  ; Widen the values.
  %val_bytes = bitcast <128 x i8> %a to <32 x i32>
  %val_wide = call <64 x i32> @llvm.hexagon.V6.vzb.128B(<32 x i32> %val_bytes)
  %val_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %val_wide)
  %val_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %val_wide)
  ; Widen the shift amounts.
  %amt_bytes = bitcast <128 x i8> %b to <32 x i32>
  %amt_wide = call <64 x i32> @llvm.hexagon.V6.vsb.128B(<32 x i32> %amt_bytes)
  %amt_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %amt_wide)
  %amt_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %amt_wide)
  ; Shift each half, then narrow back to bytes.
  %shifted_upper = call <32 x i32> @llvm.hexagon.V6.vlsrhv.128B(<32 x i32> %val_upper, <32 x i32> %amt_upper)
  %shifted_lower = call <32 x i32> @llvm.hexagon.V6.vlsrhv.128B(<32 x i32> %val_lower, <32 x i32> %amt_lower)
  %narrow = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %shifted_upper, <32 x i32> %shifted_lower)
  %result = bitcast <32 x i32> %narrow to <128 x i8>
  ret <128 x i8> %result
}
311
; Signed byte shift-right by per-lane amounts, emulated with halfword
; shifts: both values and shift amounts are widened with V6.vsb, each half
; is shifted with V6.vasrhv, and V6.vshuffeb merges the halves back into
; bytes.
define weak_odr <128 x i8> @halide.hexagon.shr.vb.vb(<128 x i8> %a, <128 x i8> %b) nounwind uwtable readnone alwaysinline {
  ; Widen the values.
  %val_bytes = bitcast <128 x i8> %a to <32 x i32>
  %val_wide = call <64 x i32> @llvm.hexagon.V6.vsb.128B(<32 x i32> %val_bytes)
  %val_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %val_wide)
  %val_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %val_wide)
  ; Widen the shift amounts.
  %amt_bytes = bitcast <128 x i8> %b to <32 x i32>
  %amt_wide = call <64 x i32> @llvm.hexagon.V6.vsb.128B(<32 x i32> %amt_bytes)
  %amt_upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %amt_wide)
  %amt_lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %amt_wide)
  ; Shift each half, then narrow back to bytes.
  %shifted_upper = call <32 x i32> @llvm.hexagon.V6.vasrhv.128B(<32 x i32> %val_upper, <32 x i32> %amt_upper)
  %shifted_lower = call <32 x i32> @llvm.hexagon.V6.vasrhv.128B(<32 x i32> %val_lower, <32 x i32> %amt_lower)
  %narrow = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %shifted_upper, <32 x i32> %shifted_lower)
  %result = bitcast <32 x i32> %narrow to <128 x i8>
  ret <128 x i8> %result
}
327
328declare <64 x i32> @llvm.hexagon.V6.vmpabus.128B(<64 x i32>, i32)
329declare <64 x i32> @llvm.hexagon.V6.vmpabus.acc.128B(<64 x i32>, <64 x i32>, i32)
330
; Wrapper around V6.vmpabus: combines two byte vectors into a vector pair
; (%high_v as the high half, %low_v as the low half) and applies the
; intrinsic with a scalar built from the two byte coefficients by
; halide.hexagon.interleave.b.dup2.h. The result is returned as halfwords.
define weak_odr <128 x i16> @halide.hexagon.add_2mpy.vub.vub.b.b(<128 x i8> %low_v, <128 x i8> %high_v, i8 %low_c, i8 %high_c) nounwind uwtable readnone {
  ; Form the vector-pair operand.
  %hi_words = bitcast <128 x i8> %high_v to <32 x i32>
  %lo_words = bitcast <128 x i8> %low_v to <32 x i32>
  %operand = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %hi_words, <32 x i32> %lo_words)
  ; Pack the two coefficients into one 32-bit scalar.
  %coeffs = call i32 @halide.hexagon.interleave.b.dup2.h(i8 %low_c, i8 %high_c)
  %sum = call <64 x i32> @llvm.hexagon.V6.vmpabus.128B(<64 x i32> %operand, i32 %coeffs)
  %sum_h = bitcast <64 x i32> %sum to <128 x i16>
  ret <128 x i16> %sum_h
}
340
; Accumulating wrapper around V6.vmpabus.acc: %acc is passed as the first
; (accumulator) operand, the two byte vectors are combined into a vector
; pair, and the two byte coefficients are packed into a scalar by
; halide.hexagon.interleave.b.dup2.h. The result is returned as halfwords.
define weak_odr <128 x i16> @halide.hexagon.acc_add_2mpy.vh.vub.vub.b.b(<128 x i16> %acc, <128 x i8> %low_v, <128 x i8> %high_v, i8 %low_c, i8 %high_c) nounwind uwtable readnone {
  %acc_words = bitcast <128 x i16> %acc to <64 x i32>
  ; Form the vector-pair operand.
  %hi_words = bitcast <128 x i8> %high_v to <32 x i32>
  %lo_words = bitcast <128 x i8> %low_v to <32 x i32>
  %operand = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %hi_words, <32 x i32> %lo_words)
  ; Pack the two coefficients into one 32-bit scalar.
  %coeffs = call i32 @halide.hexagon.interleave.b.dup2.h(i8 %low_c, i8 %high_c)
  %sum = call <64 x i32> @llvm.hexagon.V6.vmpabus.acc.128B(<64 x i32> %acc_words, <64 x i32> %operand, i32 %coeffs)
  %sum_h = bitcast <64 x i32> %sum to <128 x i16>
  ret <128 x i16> %sum_h
}
351
352declare <64 x i32> @llvm.hexagon.V6.vmpahb.128B(<64 x i32>, i32)
353declare <64 x i32> @llvm.hexagon.V6.vmpahb.acc.128B(<64 x i32>, <64 x i32>, i32)
354
; Wrapper around V6.vmpahb: combines two halfword vectors into a vector pair
; (%high_v as the high half, %low_v as the low half) and applies the
; intrinsic with a scalar built from the two byte coefficients by
; halide.hexagon.interleave.b.dup2.h.
define weak_odr <64 x i32> @halide.hexagon.add_2mpy.vh.vh.b.b(<64 x i16> %low_v, <64 x i16> %high_v, i8 %low_c, i8 %high_c) nounwind uwtable readnone {
  ; Form the vector-pair operand.
  %hi_words = bitcast <64 x i16> %high_v to <32 x i32>
  %lo_words = bitcast <64 x i16> %low_v to <32 x i32>
  %operand = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %hi_words, <32 x i32> %lo_words)
  ; Pack the two coefficients into one 32-bit scalar.
  %coeffs = call i32 @halide.hexagon.interleave.b.dup2.h(i8 %low_c, i8 %high_c)
  %sum = call <64 x i32> @llvm.hexagon.V6.vmpahb.128B(<64 x i32> %operand, i32 %coeffs)
  ret <64 x i32> %sum
}
363
; Accumulating wrapper around V6.vmpahb.acc: %acc is passed as the first
; (accumulator) operand, the two halfword vectors are combined into a vector
; pair, and the two byte coefficients are packed into a scalar by
; halide.hexagon.interleave.b.dup2.h.
define weak_odr <64 x i32> @halide.hexagon.acc_add_2mpy.vw.vh.vh.b.b(<64 x i32> %acc, <64 x i16> %low_v, <64 x i16> %high_v, i8 %low_c, i8 %high_c) nounwind uwtable readnone {
  ; Form the vector-pair operand.
  %hi_words = bitcast <64 x i16> %high_v to <32 x i32>
  %lo_words = bitcast <64 x i16> %low_v to <32 x i32>
  %operand = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %hi_words, <32 x i32> %lo_words)
  ; Pack the two coefficients into one 32-bit scalar.
  %coeffs = call i32 @halide.hexagon.interleave.b.dup2.h(i8 %low_c, i8 %high_c)
  %sum = call <64 x i32> @llvm.hexagon.V6.vmpahb.acc.128B(<64 x i32> %acc, <64 x i32> %operand, i32 %coeffs)
  ret <64 x i32> %sum
}
372
373; Define a missing saturating narrow instruction in terms of a saturating narrowing shift.
374declare <32 x i32> @llvm.hexagon.V6.vasrwuhsat.128B(<32 x i32>, <32 x i32>, i32)
375
; Narrow a pair of word vectors to one vector of halfwords by calling the
; narrowing shift V6.vasrwuhsat with a shift amount of 0, so only its
; narrowing step takes effect.
define weak_odr <64 x i16> @halide.hexagon.trunc_satuh.vw(<64 x i32> %arg) nounwind uwtable readnone alwaysinline {
  %upper = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %arg)
  %lower = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %arg)
  ; Shift by zero: the value is unchanged before being narrowed.
  %narrow = call <32 x i32> @llvm.hexagon.V6.vasrwuhsat.128B(<32 x i32> %upper, <32 x i32> %lower, i32 0)
  %result = bitcast <32 x i32> %narrow to <64 x i16>
  ret <64 x i16> %result
}
383
384declare <64 x i32> @llvm.hexagon.V6.vtmpybus.128B(<64 x i32>, i32)
385declare <64 x i32> @llvm.hexagon.V6.vtmpyb.128B(<64 x i32>, i32)
386declare <64 x i32> @llvm.hexagon.V6.vtmpyhb.128B(<64 x i32>, i32)
387
; Wrapper around V6.vtmpybus: combines two byte vectors into a vector pair
; (%high_v as the high half, %low_v as the low half) and applies the
; intrinsic with a scalar built from the two byte coefficients by
; halide.hexagon.interleave.b.dup2.h. The result is returned as halfwords.
define weak_odr <128 x i16> @halide.hexagon.vtmpy.vub.vub.b.b(<128 x i8> %low_v, <128 x i8> %high_v, i8 %low_c, i8 %high_c) nounwind uwtable readnone {
  ; Form the vector-pair operand.
  %hi_words = bitcast <128 x i8> %high_v to <32 x i32>
  %lo_words = bitcast <128 x i8> %low_v to <32 x i32>
  %operand = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %hi_words, <32 x i32> %lo_words)
  ; Pack the two coefficients into one 32-bit scalar.
  %coeffs = call i32 @halide.hexagon.interleave.b.dup2.h(i8 %low_c, i8 %high_c)
  %out = call <64 x i32> @llvm.hexagon.V6.vtmpybus.128B(<64 x i32> %operand, i32 %coeffs)
  %out_h = bitcast <64 x i32> %out to <128 x i16>
  ret <128 x i16> %out_h
}
397
; Wrapper around V6.vtmpyb: combines two byte vectors into a vector pair
; (%high_v as the high half, %low_v as the low half) and applies the
; intrinsic with a scalar built from the two byte coefficients by
; halide.hexagon.interleave.b.dup2.h. The result is returned as halfwords.
define weak_odr <128 x i16> @halide.hexagon.vtmpy.vb.vb.b.b(<128 x i8> %low_v, <128 x i8> %high_v, i8 %low_c, i8 %high_c) nounwind uwtable readnone {
  ; Form the vector-pair operand.
  %hi_words = bitcast <128 x i8> %high_v to <32 x i32>
  %lo_words = bitcast <128 x i8> %low_v to <32 x i32>
  %operand = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %hi_words, <32 x i32> %lo_words)
  ; Pack the two coefficients into one 32-bit scalar.
  %coeffs = call i32 @halide.hexagon.interleave.b.dup2.h(i8 %low_c, i8 %high_c)
  %out = call <64 x i32> @llvm.hexagon.V6.vtmpyb.128B(<64 x i32> %operand, i32 %coeffs)
  %out_h = bitcast <64 x i32> %out to <128 x i16>
  ret <128 x i16> %out_h
}
407
; Wrapper around V6.vtmpyhb: combines two halfword vectors into a vector pair
; (%high_v as the high half, %low_v as the low half) and applies the
; intrinsic with a scalar built from the two byte coefficients by
; halide.hexagon.interleave.b.dup2.h.
define weak_odr <64 x i32> @halide.hexagon.vtmpy.vh.vh.b.b(<64 x i16> %low_v, <64 x i16> %high_v, i8 %low_c, i8 %high_c) nounwind uwtable readnone {
  ; Form the vector-pair operand.
  %hi_words = bitcast <64 x i16> %high_v to <32 x i32>
  %lo_words = bitcast <64 x i16> %low_v to <32 x i32>
  %operand = call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %hi_words, <32 x i32> %lo_words)
  ; Pack the two coefficients into one 32-bit scalar.
  %coeffs = call i32 @halide.hexagon.interleave.b.dup2.h(i8 %low_c, i8 %high_c)
  %out = call <64 x i32> @llvm.hexagon.V6.vtmpyhb.128B(<64 x i32> %operand, i32 %coeffs)
  ret <64 x i32> %out
}
416
417declare void @llvm.hexagon.V6.vgathermh.128B(i8*, i32, i32, <32 x i32>)
418declare void @llvm.hexagon.V6.vgathermw.128B(i8*, i32, i32, <32 x i32>)
419
; Halfword gather wrapper around V6.vgathermh.
; The destination address is %dst_base advanced by %dst_index i16 elements;
; the source pointer is passed to the intrinsic as a 32-bit integer together
; with %size and the index vector (reinterpreted as <32 x i32>).
define weak_odr void @halide.hexagon.vgather.h.h(i8* %dst_base, i32 %dst_index, i8* %src_ptr, i32 %size, <64 x i16> %index) nounwind uwtable {
  ; Compute the destination address in units of i16.
  %dst_h = bitcast i8* %dst_base to i16*
  %dst_elem = getelementptr i16, i16* %dst_h, i32 %dst_index
  %dst_addr = bitcast i16* %dst_elem to i8*
  ; Marshal the remaining operands into the types the intrinsic expects.
  %src_addr = ptrtoint i8* %src_ptr to i32
  %offsets = bitcast <64 x i16> %index to <32 x i32>
  call void @llvm.hexagon.V6.vgathermh.128B(i8* %dst_addr, i32 %src_addr, i32 %size, <32 x i32> %offsets)
  ret void
}
429
; Word gather wrapper around V6.vgathermw.
; The destination address is %dst_base advanced by %dst_index i32 elements;
; the source pointer is passed to the intrinsic as a 32-bit integer together
; with %size and the index vector.
define weak_odr void @halide.hexagon.vgather.w.w(i8* %dst_base, i32 %dst_index, i8* %src_ptr, i32 %size, <32 x i32> %index) nounwind uwtable {
  ; Compute the destination address in units of i32.
  %dst_w = bitcast i8* %dst_base to i32*
  %dst_elem = getelementptr i32, i32* %dst_w, i32 %dst_index
  %dst_addr = bitcast i32* %dst_elem to i8*
  %src_addr = ptrtoint i8* %src_ptr to i32
  call void @llvm.hexagon.V6.vgathermw.128B(i8* %dst_addr, i32 %src_addr, i32 %size, <32 x i32> %index)
  ret void
}
438
439declare void @llvm.hexagon.V6.vscattermh.128B(i32, i32, <32 x i32>, <32 x i32>)
440declare void @llvm.hexagon.V6.vscattermw.128B(i32, i32, <32 x i32>, <32 x i32>)
441
; Halfword scatter wrapper around V6.vscattermh.
; The buffer pointer is passed as a 32-bit integer; the index and value
; vectors are reinterpreted as <32 x i32> to match the intrinsic signature.
define weak_odr void @halide.hexagon.vscatter.h.h(i8* %buf_ptr, i32 %size, <64 x i16> %idx, <64 x i16> %val) nounwind uwtable writeonly {
  %base = ptrtoint i8* %buf_ptr to i32
  %offsets = bitcast <64 x i16> %idx to <32 x i32>
  %values = bitcast <64 x i16> %val to <32 x i32>
  call void @llvm.hexagon.V6.vscattermh.128B(i32 %base, i32 %size, <32 x i32> %offsets, <32 x i32> %values) nounwind writeonly
  ret void
}
449
; Word scatter wrapper around V6.vscattermw.
; The buffer pointer is passed as a 32-bit integer; the index and value
; vectors already have the type the intrinsic expects.
define weak_odr void @halide.hexagon.vscatter.w.w(i8* %buf_ptr, i32 %size, <32 x i32> %idx, <32 x i32> %val) nounwind uwtable writeonly {
  %base = ptrtoint i8* %buf_ptr to i32
  call void @llvm.hexagon.V6.vscattermw.128B(i32 %base, i32 %size, <32 x i32> %idx, <32 x i32> %val)
  ret void
}
455
456declare void @llvm.hexagon.V6.vscattermh.add.128B(i32, i32, <32 x i32>, <32 x i32>)
457declare void @llvm.hexagon.V6.vscattermw.add.128B(i32, i32, <32 x i32>, <32 x i32>)
458
; Halfword scatter-accumulate wrapper around V6.vscattermh.add.
; The buffer pointer is passed as a 32-bit integer; the index and value
; vectors are reinterpreted as <32 x i32> to match the intrinsic signature.
; NOTE(review): this function and its call site are marked writeonly even
; though the ".add" form presumably reads the destination before writing --
; confirm this is the intended memory-effect annotation.
define weak_odr void @halide.hexagon.vscatter_acc.h.h(i8* %buf_ptr, i32 %size, <64 x i16> %idx, <64 x i16> %val) nounwind uwtable writeonly {
  %base = ptrtoint i8* %buf_ptr to i32
  %offsets = bitcast <64 x i16> %idx to <32 x i32>
  %values = bitcast <64 x i16> %val to <32 x i32>
  call void @llvm.hexagon.V6.vscattermh.add.128B(i32 %base, i32 %size, <32 x i32> %offsets, <32 x i32> %values) nounwind writeonly
  ret void
}
466
; Word scatter-accumulate wrapper around V6.vscattermw.add.
; The buffer pointer is passed as a 32-bit integer; the index and value
; vectors already have the type the intrinsic expects.
; NOTE(review): this function is marked writeonly even though the ".add"
; form presumably reads the destination before writing -- confirm this is
; the intended memory-effect annotation.
define weak_odr void @halide.hexagon.vscatter_acc.w.w(i8* %buf_ptr, i32 %size, <32 x i32> %idx, <32 x i32> %val) nounwind uwtable writeonly {
  %base = ptrtoint i8* %buf_ptr to i32
  call void @llvm.hexagon.V6.vscattermw.add.128B(i32 %base, i32 %size, <32 x i32> %idx, <32 x i32> %val)
  ret void
}
472
; Emits a two-instruction inline-asm sequence on %ptr: a vmem access with the
; ":scatter_release" qualifier, followed by a vector load from the same
; address into v1.
; NOTE(review): presumably this makes prior scatters complete before later
; accesses -- confirm against the HVX PRM.
define weak_odr void @halide.hexagon.scatter.release(i8* %ptr) nounwind uwtable {
  ; Constraints: %ptr is supplied as both an indirect memory output ("=*m")
  ; and an indirect memory input ("*m"); v1 is listed as clobbered because
  ; the second instruction writes it. "sideeffect" marks the asm as having
  ; effects beyond its listed operands.
  call void asm sideeffect "vmem($0 + #0):scatter_release\0A; v1 = vmem($0 + #0)\0A", "=*m,*m,~{v1}"(i8* %ptr, i8* %ptr)
  ret void
}
477