1 // Pass to fuse CC operations with other instructions.
2 // Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 //
4 // This file is part of GCC.
5 //
6 // GCC is free software; you can redistribute it and/or modify it under
7 // the terms of the GNU General Public License as published by the Free
8 // Software Foundation; either version 3, or (at your option) any later
9 // version.
10 //
11 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 // for more details.
15 //
16 // You should have received a copy of the GNU General Public License
17 // along with GCC; see the file COPYING3.  If not see
18 // <http://www.gnu.org/licenses/>.
19 
20 // This pass looks for sequences of the form:
21 //
22 //    A: (set (reg R1) X1)
23 //    B: ...instructions that might change the value of X1...
24 //    C: (set (reg CC) X2) // X2 uses R1
25 //
26 // and tries to change them to:
27 //
28 //    C': [(set (reg CC) X2')
29 //         (set (reg R1) X1)]
30 //    B: ...instructions that might change the value of X1...
31 //
32 // where X2' is the result of replacing R1 with X1 in X2.
33 //
34 // This sequence occurs in SVE code in two important cases:
35 //
36 // (a) Sometimes, to deal correctly with overflow, we need to increment
37 //     an IV after a WHILELO rather than before it.  In this case:
38 //     - A is a WHILELO,
39 //     - B includes an IV increment and
40 //     - C is a separate PTEST.
41 //
42 // (b) ACLE code of the form:
43 //
44 //       svbool_t ok = svrdffr ();
45 //       if (svptest_last (pg, ok))
46 //         ...
47 //
48 //     must, for performance reasons, be code-generated as:
49 //
50 //       RDFFRS Pok.B, Pg/Z
51 //       ...branch on flags result...
52 //
53 //     without a separate PTEST of Pok.  In this case:
54 //     - A is an aarch64_rdffr
55 //     - B includes an aarch64_update_ffrt
56 //     - C is a separate PTEST
57 //
58 // Combine can handle this optimization if B doesn't exist and if A and
59 // C are in the same BB.  This pass instead handles cases where B does
60 // exist and cases where A and C are in different BBs of the same EBB.
61 
62 #define IN_TARGET_CODE 1
63 
64 #define INCLUDE_ALGORITHM
65 #define INCLUDE_FUNCTIONAL
66 #include "config.h"
67 #include "system.h"
68 #include "coretypes.h"
69 #include "backend.h"
70 #include "rtl.h"
71 #include "df.h"
72 #include "rtl-ssa.h"
73 #include "tree-pass.h"
74 
75 using namespace rtl_ssa;
76 
77 namespace {
78 const pass_data pass_data_cc_fusion =
79 {
80   RTL_PASS, // type
81   "cc_fusion", // name
82   OPTGROUP_NONE, // optinfo_flags
83   TV_NONE, // tv_id
84   0, // properties_required
85   0, // properties_provided
86   0, // properties_destroyed
87   0, // todo_flags_start
88   TODO_df_finish, // todo_flags_finish
89 };
90 
91 // Class that represents one run of the pass.
92 class cc_fusion
93 {
94 public:
cc_fusion()95   cc_fusion ()  : m_parallel () {}
96   void execute ();
97 
98 private:
99   rtx optimizable_set (const insn_info *);
100   bool parallelize_insns (def_info *, rtx, def_info *, rtx);
101   void optimize_cc_setter (def_info *, rtx);
102 
103   // A spare PARALLEL rtx, or null if none.
104   rtx m_parallel;
105 };
106 
107 // See whether INSN is a single_set that we can optimize.  Return the
108 // set if so, otherwise return null.
109 rtx
optimizable_set(const insn_info * insn)110 cc_fusion::optimizable_set (const insn_info *insn)
111 {
112   if (!insn->can_be_optimized ()
113       || insn->is_asm ()
114       || insn->has_volatile_refs ()
115       || insn->has_pre_post_modify ())
116     return NULL_RTX;
117 
118   return single_set (insn->rtl ());
119 }
120 
121 // CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise
122 // a single_set that sets (only) OTHER_DEF.  CC_SET is known to set the
123 // CC register and the instruction that contains CC_SET is known to use
124 // OTHER_DEF.  Try to do CC_SET and OTHER_SET in parallel.
125 bool
parallelize_insns(def_info * cc_def,rtx cc_set,def_info * other_def,rtx other_set)126 cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set,
127 			      def_info *other_def, rtx other_set)
128 {
129   auto attempt = crtl->ssa->new_change_attempt ();
130 
131   insn_info *cc_insn = cc_def->insn ();
132   insn_info *other_insn = other_def->insn ();
133   if (dump_file && (dump_flags & TDF_DETAILS))
134     fprintf (dump_file, "trying to parallelize insn %d and insn %d\n",
135 	     other_insn->uid (), cc_insn->uid ());
136 
137   // Try to substitute OTHER_SET into CC_INSN.
138   insn_change_watermark rtl_watermark;
139   rtx_insn *cc_rtl = cc_insn->rtl ();
140   insn_propagation prop (cc_rtl, SET_DEST (other_set),
141 			 SET_SRC (other_set));
142   if (!prop.apply_to_pattern (&PATTERN (cc_rtl))
143       || prop.num_replacements == 0)
144     {
145       if (dump_file && (dump_flags & TDF_DETAILS))
146 	fprintf (dump_file, "-- failed to substitute all uses of r%d\n",
147 		 other_def->regno ());
148       return false;
149     }
150 
151   // Restrict the uses to those outside notes.
152   use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ());
153   use_array other_set_uses = remove_note_accesses (attempt,
154 						   other_insn->uses ());
155 
156   // Remove the use of the substituted value.
157   access_array_builder uses_builder (attempt);
158   uses_builder.reserve (cc_uses.size ());
159   for (use_info *use : cc_uses)
160     if (use->def () != other_def)
161       uses_builder.quick_push (use);
162   cc_uses = use_array (uses_builder.finish ());
163 
164   // Get the list of uses for the new instruction.
165   insn_change cc_change (cc_insn);
166   cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses);
167   if (!cc_change.new_uses.is_valid ())
168     {
169       if (dump_file && (dump_flags & TDF_DETAILS))
170 	fprintf (dump_file, "-- cannot merge uses\n");
171       return false;
172     }
173 
174   // The instruction initially defines just two registers.  recog can add
175   // extra clobbers if necessary.
176   auto_vec<access_info *, 2> new_defs;
177   new_defs.quick_push (cc_def);
178   new_defs.quick_push (other_def);
179   sort_accesses (new_defs);
180   cc_change.new_defs = def_array (access_array (new_defs));
181 
182   // Make sure there is somewhere that the new instruction could live.
183   auto other_change = insn_change::delete_insn (other_insn);
184   insn_change *changes[] = { &other_change, &cc_change };
185   cc_change.move_range = cc_insn->ebb ()->insn_range ();
186   if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes)))
187     {
188       if (dump_file && (dump_flags & TDF_DETAILS))
189 	fprintf (dump_file, "-- cannot satisfy all definitions and uses\n");
190       return false;
191     }
192 
193   // Tentatively install the new pattern.  By convention, the CC set
194   // must be first.
195   if (m_parallel)
196     {
197       XVECEXP (m_parallel, 0, 0) = cc_set;
198       XVECEXP (m_parallel, 0, 1) = other_set;
199     }
200   else
201     {
202       rtvec vec = gen_rtvec (2, cc_set, other_set);
203       m_parallel = gen_rtx_PARALLEL (VOIDmode, vec);
204     }
205   validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1);
206 
207   // These routines report failures themselves.
208   if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes))
209       || !changes_are_worthwhile (changes)
210       || !crtl->ssa->verify_insn_changes (changes))
211     return false;
212 
213   remove_reg_equal_equiv_notes (cc_rtl);
214   confirm_change_group ();
215   crtl->ssa->change_insns (changes);
216   m_parallel = NULL_RTX;
217   return true;
218 }
219 
220 // Try to optimize the instruction that contains CC_DEF, where CC_DEF describes
221 // a definition of the CC register by CC_SET.
222 void
optimize_cc_setter(def_info * cc_def,rtx cc_set)223 cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set)
224 {
225   // Search the registers used by the CC setter for an easily-substitutable
226   // def-use chain.
227   for (use_info *other_use : cc_def->insn ()->uses ())
228     if (def_info *other_def = other_use->def ())
229       if (other_use->regno () != CC_REGNUM
230 	  && other_def->ebb () == cc_def->ebb ())
231 	if (rtx other_set = optimizable_set (other_def->insn ()))
232 	  {
233 	    rtx dest = SET_DEST (other_set);
234 	    if (REG_P (dest)
235 		&& REGNO (dest) == other_def->regno ()
236 		&& REG_NREGS (dest) == 1
237 		&& parallelize_insns (cc_def, cc_set, other_def, other_set))
238 	      return;
239 	  }
240 }
241 
242 // Run the pass on the current function.
243 void
execute()244 cc_fusion::execute ()
245 {
246   // Initialization.
247   calculate_dominance_info (CDI_DOMINATORS);
248   df_analyze ();
249   crtl->ssa = new rtl_ssa::function_info (cfun);
250 
251   // Walk through all instructions that set CC.  Look for a PTEST instruction
252   // that we can optimize.
253   //
254   // ??? The PTEST test isn't needed for correctness, but it ensures that the
255   // pass no effect on non-SVE code.
256   for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM))
257     if (rtx cc_set = optimizable_set (def->insn ()))
258       if (REG_P (SET_DEST (cc_set))
259 	  && REGNO (SET_DEST (cc_set)) == CC_REGNUM
260 	  && GET_CODE (SET_SRC (cc_set)) == UNSPEC
261 	  && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST)
262 	optimize_cc_setter (def, cc_set);
263 
264   // Finalization.
265   crtl->ssa->perform_pending_updates ();
266   free_dominance_info (CDI_DOMINATORS);
267 }
268 
269 class pass_cc_fusion : public rtl_opt_pass
270 {
271 public:
pass_cc_fusion(gcc::context * ctxt)272   pass_cc_fusion (gcc::context *ctxt)
273     : rtl_opt_pass (pass_data_cc_fusion, ctxt)
274   {}
275 
276   // opt_pass methods:
gate(function *)277   virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; }
278   virtual unsigned int execute (function *);
279 };
280 
281 unsigned int
execute(function *)282 pass_cc_fusion::execute (function *)
283 {
284   cc_fusion ().execute ();
285   return 0;
286 }
287 
288 } // end namespace
289 
290 // Create a new CC fusion pass instance.
291 
292 rtl_opt_pass *
make_pass_cc_fusion(gcc::context * ctxt)293 make_pass_cc_fusion (gcc::context *ctxt)
294 {
295   return new pass_cc_fusion (ctxt);
296 }
297