1;; Scheduling description for Niagara-7
2;;   Copyright (C) 2016-2021 Free Software Foundation, Inc.
3;;
4;; This file is part of GCC.
5;;
6;; GCC is free software; you can redistribute it and/or modify
7;; it under the terms of the GNU General Public License as published by
8;; the Free Software Foundation; either version 3, or (at your option)
9;; any later version.
10;;
11;; GCC is distributed in the hope that it will be useful,
12;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14;; GNU General Public License for more details.
15;;
16;; You should have received a copy of the GNU General Public License
17;; along with GCC; see the file COPYING3.  If not see
18;; <http://www.gnu.org/licenses/>.
19
;; Automaton to which the n7_slot0 and n7_slot1 issue slots are
;; assigned.
(define_automaton "niagara7_0")
21
;; The S4 core has a dual-issue queue.  This queue is divided into two
;; slots.  One instruction can be issued each cycle to each slot, and
;; up to 2 instructions are committed each cycle.  Each slot serves
;; several execution units, as depicted below:
;;
;;
;;                 n7_slot0 - Integer unit.
;;                          - Load/Store unit.
;; === QUEUE ==>
;;
;;                 n7_slot1 - Integer unit.
;;                          - Branch unit.
;;                          - Floating-point and graphics unit.
;;                          - 3-cycles crypto unit.
36
;; The two issue slots, both part of the niagara7_0 automaton.
(define_cpu_unit "n7_slot0" "niagara7_0")
(define_cpu_unit "n7_slot1" "niagara7_0")
38
;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle.  We assume the same
;; for multi-instruction insns.
42
;; Reserves both issue slots in the same cycle.
(define_reservation "n7_single_issue"
  "n7_slot0+n7_slot1")
44
;; Insns of type multi, savew, flushw and trap take both slots for one
;; cycle, so nothing else is issued alongside them.
(define_insn_reservation "n7_single" 1
  (and (eq_attr "type" "multi,savew,flushw,trap")
       (eq_attr "cpu" "niagara7"))
  "n7_single_issue")
49
;; Most instructions that execute in the integer unit have a latency
;; of 1 cycle.  They may be issued to either slot.
(define_insn_reservation "n7_integer" 1
  (and (eq_attr "cpu" "niagara7")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
  "n7_slot0 | n7_slot1")
57
;; Flushing the instruction memory has a latency of 27 cycles.  An
;; issue slot (either one) is only reserved for the first cycle.
(define_insn_reservation "n7_iflush" 27
  (and (eq_attr "type" "iflush")
       (eq_attr "cpu" "niagara7"))
  "(n7_slot0 | n7_slot1), nothing*26")
64
;; Integer multiplication has a latency of 12 cycles and executes in
;; the integer unit, which is reachable from either slot.
;;
;; The array*, edge* and pdistn instructions are modelled likewise.
(define_insn_reservation "n7_imul" 12
  (and (eq_attr "type" "imul,array,edge,edgen,pdistn")
       (eq_attr "cpu" "niagara7"))
  "(n7_slot0 | n7_slot1), nothing*11")
74
;; Integer division executes in the integer unit (either slot) and
;; has a latency of 35 cycles.
(define_insn_reservation "n7_idiv" 35
  (and (eq_attr "type" "idiv")
       (eq_attr "cpu" "niagara7"))
  "(n7_slot0 | n7_slot1), nothing*34")
82
83;; Both integer and floating-point load instructions have a latency of
84;; 5 cycles, and execute in the slot0.
85;;
86;; The prefetch instruction also executes in the load/store unit, but
87;; its latency is only 1 cycle.
88
;; Insns of type fpload or sload, and insns of type load whose subtype
;; is regular: latency 5, issued to slot0 only.
(define_insn_reservation "n7_load" 5
  (and (eq_attr "cpu" "niagara7")
       (ior (and (eq_attr "type" "load")
                 (eq_attr "subtype" "regular"))
            (eq_attr "type" "sload,fpload")))
  "n7_slot0, nothing*4")
95
;; Insns of type load whose subtype is prefetch: latency 1, issued to
;; slot0 only.
(define_insn_reservation "n7_prefetch" 1
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "load")
            (eq_attr "subtype" "prefetch")))
  "n7_slot0")
101
;; Integer and floating-point stores execute in the load/store unit
;; in slot0 and have a latency of 1 cycle.
(define_insn_reservation "n7_store" 1
  (and (eq_attr "type" "fpstore,store")
       (eq_attr "cpu" "niagara7"))
  "n7_slot0")
109
;; Control-transfer instructions execute in the Branch Unit, which is
;; served by slot1.  The type list is split in two only to keep the
;; lines short.
(define_insn_reservation "n7_cti" 1
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "cbcond,uncond_cbcond,branch,uncond_branch")
            (eq_attr "type" "call,sibcall,call_no_delay_slot,return")))
  "n7_slot1")
117
;; Many instructions executing in the Floating-point and Graphics unit
;; in slot1 feature a latency of 11 cycles.  This also covers fga
;; insns of subtype fpu or maxmin.
(define_insn_reservation "n7_fp" 11
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "fp,fpmove,fpcmove,fpcrmove,fpcmp,fpmul")
            (eq_attr "type" "fgm_pack,fgm_mul,pdist")
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "fpu,maxmin"))))
  "n7_slot1, nothing*10")
127
128;; Floating-point division and floating-point square-root instructions
129;; have high latencies.  They execute in the floating-point and
130;; graphics unit in the slot1.
131
132
;; Insns of type fpdivs and fpsqrts: latency 24, issued to slot1.
(define_insn_reservation "n7_fpdivs" 24
  (and (eq_attr "type" "fpdivs,fpsqrts")
       (eq_attr "cpu" "niagara7"))
  "n7_slot1, nothing*23")
137
;; Insns of type fpdivd and fpsqrtd: latency 37, issued to slot1.
(define_insn_reservation "n7_fpdivd" 37
  (and (eq_attr "type" "fpdivd,fpsqrtd")
       (eq_attr "cpu" "niagara7"))
  "n7_slot1, nothing*36")
142
143;; SIMD VIS instructions executing in the Floating-point and graphics
144;; unit (FPG) in slot1 usually have a latency of either 11 or 12
145;; cycles.
146;;
147;; However, the latency for many instructions is only 3 cycles if the
148;; consumer can also be executed in 3 cycles.  We model this with a
149;; bypass.  In these cases the instructions are executed in the
150;; 3-cycle crypto unit which also serves slot1.
151
;; VIS insns with a default latency of 11 cycles: fga insns of subtype
;; addsub64 or other, and vismv/visl insns of subtype double or
;; single.  All of them are issued to slot1.
(define_insn_reservation "n7_vis_11cycles" 11
  (and (eq_attr "cpu" "niagara7")
       (ior (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "addsub64,other"))
            (and (eq_attr "type" "vismv,visl")
                 (eq_attr "subtype" "double,single"))))
  "n7_slot1, nothing*10")
161
;; VIS insns with a default latency of 12 cycles: bmask and viscmp
;; insns, fga insns of subtype cmask, and vismv insns of subtype
;; movstouw.  All of them are issued to slot1.
(define_insn_reservation "n7_vis_12cycles" 12
  (and (eq_attr "cpu" "niagara7")
       (ior (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "movstouw"))
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "cmask"))
            (eq_attr "type" "viscmp,bmask")))
  "n7_slot1, nothing*11")
170
;; When both the producer and the consumer match one of the n7_vis_*
;; reservations, the latency between them is 3 cycles instead of the
;; producer's default latency.
(define_bypass 3
  "n7_vis_*"
  "n7_vis_*")
172
;; Some other VIS instructions have a latency of 12 cycles, and won't
;; be executed in the 3-cycle crypto pipe: insns of type lzd, and gsr
;; insns of subtype alignaddr.  The reservation name does not match
;; the n7_vis_* pattern, so the 3-cycle bypass does not apply here.
(define_insn_reservation "n7_lzd" 12
  (and (eq_attr "cpu" "niagara7")
       (ior (eq_attr "type" "lzd")
            (and (eq_attr "type" "gsr")
                 (eq_attr "subtype" "alignaddr"))))
  "n7_slot1, nothing*11")
182
183;; A couple of VIS instructions feature very low latencies in the M7.
184
;; vismv insns of subtype movxtod: latency 1, issued to slot1.
(define_insn_reservation "n7_single_vis" 1
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "vismv")
            (eq_attr "subtype" "movxtod")))
  "n7_slot1")
190
;; vismv insns of subtype movdtox: latency 2, issued to slot1.
(define_insn_reservation "n7_double_vis" 2
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "vismv")
            (eq_attr "subtype" "movdtox")))
  "n7_slot1, nothing")
196
;; Reading and writing the gsr register takes a high number of cycles
;; that is not documented in the PRM.  Let's use the same value as the
;; M8.
;;
;; As in the other reservations of this file, the insn takes its issue
;; slot for one cycle and reserves nothing for the remaining
;; (latency - 1) cycles.
(define_insn_reservation "n7_gsr_reg" 70
  (and (eq_attr "cpu" "niagara7")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "reg")))
  "n7_slot1, nothing*69")
206