; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT

;; Module-level target triple for the functions compiled in this file.
target triple = "nvptx64-unknown-cuda"

;; Make sure we are generating proper instruction sequences for fused ops.
;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
;; add.f32 otherwise.  Without an explicit rounding mode on add.f32, ptxas
;; is free to fuse with a multiply if it is able.  If fusion is not allowed,
;; we do not form fma.rn at the PTX level and explicitly generate add.rn
;; for all adds to prevent ptxas from fusing the ops.

;; t0 computes (a * b) + c, where the product is used only by the add.
;; In the run with fusion allowed the pair is expected to become a single
;; fma.rn.f32.  In the other run the two ops are expected to stay separate,
;; each with an explicit .rn rounding modifier.
;; FAST-LABEL: @t0
;; DEFAULT-LABEL: @t0
define float @t0(float %a, float %b, float %c) {
;; FAST: fma.rn.f32
;; DEFAULT: mul.rn.f32
;; DEFAULT: add.rn.f32
  %v0 = fmul float %a, %b
  %v1 = fadd float %v0, %c
  ret float %v1
}

;; t1 computes a + b with no multiply feeding the add.
;; FAST-LABEL: @t1
;; DEFAULT-LABEL: @t1
define float @t1(float %a, float %b) {
;; We cannot form an fma here.  When fusion is allowed the add is emitted
;; without a rounding modifier (add.f32).  When it is not, make sure we
;; explicitly emit add.rn.f32 to prevent ptxas from fusing this with
;; anything else.
;; FAST: add.f32
;; DEFAULT: add.rn.f32
  %v1 = fadd float %a, %b
  ret float %v1
}