1 /*
2  * Copyright (c) 2007, 2020, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.
8  *
9  * This code is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12  * version 2 for more details (a copy is included in the LICENSE file that
13  * accompanied this code).
14  *
15  * You should have received a copy of the GNU General Public License version
16  * 2 along with this work; if not, write to the Free Software Foundation,
17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18  *
19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20  * or visit www.oracle.com if you need additional information or have any
21  * questions.
22  */
23 
24 #include "precompiled.hpp"
25 #include "compiler/compileLog.hpp"
26 #include "libadt/vectset.hpp"
27 #include "memory/allocation.inline.hpp"
28 #include "memory/resourceArea.hpp"
29 #include "opto/addnode.hpp"
30 #include "opto/callnode.hpp"
31 #include "opto/castnode.hpp"
32 #include "opto/convertnode.hpp"
33 #include "opto/divnode.hpp"
34 #include "opto/matcher.hpp"
35 #include "opto/memnode.hpp"
36 #include "opto/mulnode.hpp"
37 #include "opto/opcodes.hpp"
38 #include "opto/opaquenode.hpp"
39 #include "opto/superword.hpp"
40 #include "opto/vectornode.hpp"
41 #include "opto/movenode.hpp"
42 #include "utilities/powerOfTwo.hpp"
43 
44 //
45 //                  S U P E R W O R D   T R A N S F O R M
46 //=============================================================================
47 
48 //------------------------------SuperWord---------------------------
// Construct the SuperWord analysis context for one pass of the loop
// optimizer.  All growable structures are allocated from the compilation's
// comp_arena(), so they live for the duration of the compile and are
// released wholesale with it.  Member-initializer order must match the
// declaration order of the fields in superword.hpp.
SuperWord::SuperWord(PhaseIdealLoop* phase) :
  _phase(phase),
  _arena(phase->C->comp_arena()),
  _igvn(phase->_igvn),
  _packset(arena(), 8,  0, NULL),         // packs for the current block
  _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb; sized ~10% over current node count to limit regrowth
  _block(arena(), 8,  0, NULL),           // nodes in current block
  _post_block(arena(), 8, 0, NULL),       // nodes common to current block which are marked as post loop vectorizable
  _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  _cmovev_kit(_arena, this),              // map to facilitate CMoveV creation
  _align_to_ref(NULL),                    // memory reference to align vectors to
  _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  _dg(_arena),                            // dependence graph
  _visited(arena()),                      // visited node set
  _post_visited(arena()),                 // post visited node set
  _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  _lpt(NULL),                             // loop tree node
  _lp(NULL),                              // CountedLoopNode
  _pre_loop_end(NULL),                    // Pre loop CountedLoopEndNode
  _bb(NULL),                              // basic block
  _iv(NULL),                              // induction var
  _race_possible(false),                  // cases where SDMU is true
  _early_return(true),                    // analysis evaluations routine
  _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
  _do_reserve_copy(DoReserveCopyInSuperWord),
  _num_work_vecs(0),                      // amount of vector work we have
  _num_reductions(0),                     // amount of reduction work we have
  _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
  _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
  _ii_order(arena(), 8, 0, 0)
{
#ifndef PRODUCT
  // Debug-only verbosity level for vectorization tracing; taken from the
  // per-method compiler directive when a method is being compiled.
  _vector_loop_debug = 0;
  if (_phase->C->method() != NULL) {
    _vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
  }

#endif
}
94 
95 static const bool _do_vector_loop_experimental = false; // Experimental vectorization which uses data from loop unrolling.
96 
97 //------------------------------transform_loop---------------------------
// Entry point for vectorizing one counted loop.  Performs a series of
// cheap legality/profitability bail-out checks, then initializes the
// per-loop state and (when do_optimization is set) runs SLP_extract().
// lpt must be headed by a CountedLoop.  Returns without effect when the
// loop is not a candidate for vectorization.
void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  assert(UseSuperWord, "should be");
  // SuperWord only works with power of two vector sizes.
  int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
  if (vector_width < 2 || !is_power_of_2(vector_width)) {
    return;
  }

  assert(lpt->_head->is_CountedLoop(), "must be");
  CountedLoopNode *cl = lpt->_head->as_CountedLoop();

  if (!cl->is_valid_counted_loop(T_INT)) return; // skip malformed counted loop

  // Post loops are only processed when the target supports predicated
  // (masked) vectors and multiversioning is enabled.
  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
  if (post_loop_allowed) {
    if (cl->is_reduction_loop()) return; // no predication mapping
    Node *limit = cl->limit();
    if (limit->is_Con()) return; // non constant limits only
    // Now check the limit for expressions we do not handle
    if (limit->is_Add()) {
      Node *in2 = limit->in(2);
      if (in2->is_Con()) {
        int val = in2->get_int();
        // should not try to program these cases
        if (val < 0) return;
      }
    }
  }

  // skip any loop that has not been assigned max unroll by analysis
  if (do_optimization) {
    if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) return;
  }

  // Check for no control flow in body (other than exit)
  Node *cl_exit = cl->loopexit();
  if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
    #ifndef PRODUCT
      if (TraceSuperWord) {
        tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
        tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
        tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
        tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
        lpt->dump_head();
      }
    #endif
    return;
  }

  // Make sure there are no extra control users of the loop backedge
  if (cl->back_control()->outcnt() != 1) {
    return;
  }

  // Skip any loops already optimized by slp
  if (cl->is_vectorized_loop()) return;

  if (cl->is_unroll_only()) return;

  if (cl->is_main_loop()) {
    // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
    // The Opaque1 marker is required so the pre-loop limit can later be
    // adjusted for alignment; without it we cannot vectorize this loop.
    CountedLoopEndNode* pre_end = find_pre_loop_end(cl);
    if (pre_end == NULL) {
      return;
    }
    Node* pre_opaq1 = pre_end->limit();
    if (pre_opaq1->Opcode() != Op_Opaque1) {
      return;
    }
    set_pre_loop_end(pre_end);
  }

  init(); // initialize data structures

  set_lpt(lpt);
  set_lp(cl);

  // For now, define one block which is the entire loop body
  set_bb(cl);

  if (do_optimization) {
    assert(_packset.length() == 0, "packset must be empty");
    SLP_extract();
    if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
      // If the main loop was vectorized, propagate its slp unroll factor to
      // the following post loop so it can be mask-vectorized later.
      if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
        IdealLoopTree *lpt_next = lpt->_next;
        CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
        _phase->has_range_checks(lpt_next);
        if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
          if (!cl_next->is_vectorized_loop()) {
            int slp_max_unroll_factor = cl->slp_max_unroll();
            cl_next->set_slp_max_unroll(slp_max_unroll_factor);
          }
        }
      }
    }
  }
}
196 
197 //------------------------------early unrolling analysis------------------------------
// Early (pre-unrolling) SLP analysis.  Scans the loop body to decide whether
// it looks vectorizable and, if so, computes the unroll factor that matches
// the widest vector the platform supports for the types used in the loop.
// On success the result is written to local_loop_unroll_factor (in/out) and
// recorded on the CountedLoop via set_slp_max_unroll(); on failure the loop
// is simply not marked as having passed slp.
void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
  bool is_slp = true;
  ResourceMark rm;
  size_t ignored_size = lpt()->_body.size();
  // ignored_loop_nodes[i] != -1 means body node i is irrelevant for
  // choosing the vector size (control, address math, etc.).
  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
  Node_Stack nstack((int)ignored_size);
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  Node *cl_exit = cl->loopexit_or_null();
  int rpo_idx = _post_block.length();

  assert(rpo_idx == 0, "post loop block is empty");

  // First clear the entries
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    ignored_loop_nodes[i] = -1;
  }

  int max_vector = Matcher::max_vector_size(T_BYTE);
  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());

  // Process the loop, some/all of the stack entries will not be in order, ergo
  // need to preprocess the ignored initial state before we process the loop
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node* n = lpt()->_body.at(i);
    // Bookkeeping nodes (iv increment, reductions, address math, loop
    // control) never carry vectorizable work; ignore them.
    if (n == cl->incr() ||
      n->is_reduction() ||
      n->is_AddP() ||
      n->is_Cmp() ||
      n->is_IfTrue() ||
      n->is_CountedLoop() ||
      (n == cl_exit)) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    if (n->is_If()) {
      IfNode *iff = n->as_If();
      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
        if (lpt()->is_loop_exit(iff)) {
          // A profiled loop-exit test is acceptable control flow; ignore it.
          ignored_loop_nodes[i] = n->_idx;
          continue;
        }
      }
    }

    if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
      // A memory phi whose backedge value is not itself a memory op means
      // the memory state is too complex for slp.
      Node* n_tail = n->in(LoopNode::LoopBackControl);
      if (n_tail != n->in(LoopNode::EntryControl)) {
        if (!n_tail->is_Mem()) {
          is_slp = false;
          break;
        }
      }
    }

    // This must happen after check of phi/if
    if (n->is_Phi() || n->is_If()) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    if (n->is_LoadStore() || n->is_MergeMem() ||
      (n->is_Proj() && !n->as_Proj()->is_CFG())) {
      // Atomic load/stores, memory merges and data projections are not
      // handled by the vectorizer.
      is_slp = false;
      break;
    }

    // Ignore nodes with non-primitive type.
    BasicType bt;
    if (n->is_Mem()) {
      bt = n->as_Mem()->memory_type();
    } else {
      bt = n->bottom_type()->basic_type();
    }
    if (is_java_primitive(bt) == false) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    if (n->is_Mem()) {
      MemNode* current = n->as_Mem();
      Node* adr = n->in(MemNode::Address);
      Node* n_ctrl = _phase->get_ctrl(adr);

      // save a queue of post process nodes
      if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
        // Process the memory expression: collect every node feeding the
        // address computation so those nodes are also marked ignored.
        int stack_idx = 0;
        bool have_side_effects = true;
        if (adr->is_AddP() == false) {
          nstack.push(adr, stack_idx++);
        } else {
          // Mark the components of the memory operation in nstack
          SWPointer p1(current, this, &nstack, true);
          have_side_effects = p1.node_stack()->is_nonempty();
        }

        // Process the pointer stack
        while (have_side_effects) {
          Node* pointer_node = nstack.node();
          for (uint j = 0; j < lpt()->_body.size(); j++) {
            Node* cur_node = lpt()->_body.at(j);
            if (cur_node == pointer_node) {
              ignored_loop_nodes[j] = cur_node->_idx;
              break;
            }
          }
          nstack.pop();
          have_side_effects = nstack.is_nonempty();
        }
      }
    }
  }

  if (is_slp) {
    // Now we try to find the maximum supported consistent vector which the machine
    // description can use
    bool small_basic_type = false;
    bool flag_small_bt = false;
    for (uint i = 0; i < lpt()->_body.size(); i++) {
      if (ignored_loop_nodes[i] != -1) continue;

      BasicType bt;
      Node* n = lpt()->_body.at(i);
      if (n->is_Mem()) {
        bt = n->as_Mem()->memory_type();
      } else {
        bt = n->bottom_type()->basic_type();
      }

      if (post_loop_allowed) {
        if (!small_basic_type) {
          // Track types for which predicated post-loop mapping is not
          // supported; see the else-branch at the bottom of this function.
          switch (bt) {
          case T_CHAR:
          case T_BYTE:
          case T_SHORT:
            small_basic_type = true;
            break;

          case T_LONG:
            // TODO: Remove when support completed for mask context with LONG.
            //       Support needs to be augmented for logical qword operations, currently we map to dword
            //       buckets for vectors on logicals as these were legacy.
            small_basic_type = true;
            break;

          default:
            break;
          }
        }
      }

      if (is_java_primitive(bt) == false) continue;

      int cur_max_vector = Matcher::max_vector_size(bt);

      // If a max vector exists which is not larger than _local_loop_unroll_factor
      // stop looking, we already have the max vector to map to.
      if (cur_max_vector < local_loop_unroll_factor) {
        is_slp = false;
        if (TraceSuperWordLoopUnrollAnalysis) {
          tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
        }
        break;
      }

      // Map the maximal common vector
      if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
        if (cur_max_vector < max_vector && !flag_small_bt) {
          max_vector = cur_max_vector;
        } else if (cur_max_vector > max_vector && UseSubwordForMaxVector) {
          // Analyse subword in the loop to set maximum vector size to take advantage of full vector width for subword types.
          // Here we analyze if narrowing is likely to happen and if it is we set vector size more aggressively.
          // We check for possibility of narrowing by looking through chain operations using subword types.
          if (is_subword_type(bt)) {
            uint start, end;
            VectorNode::vector_operands(n, &start, &end);

            for (uint j = start; j < end; j++) {
              Node* in = n->in(j);
              // Don't propagate through a memory
              if (!in->is_Mem() && in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
                bool same_type = true;
                for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
                  Node *use = in->fast_out(k);
                  if (!in_bb(use) && use->bottom_type()->basic_type() != bt) {
                    same_type = false;
                    break;
                  }
                }
                if (same_type) {
                  max_vector = cur_max_vector;
                  flag_small_bt = true;
                  cl->mark_subword_loop();
                }
              }
            }
          }
        }
        // We only process post loops on predicated targets where we want to
        // mask map the loop to a single iteration
        if (post_loop_allowed) {
          _post_block.at_put_grow(rpo_idx++, n);
        }
      }
    }
    if (is_slp) {
      local_loop_unroll_factor = max_vector;
      cl->mark_passed_slp();
    }
    cl->mark_was_slp();
    if (cl->is_main_loop()) {
      cl->set_slp_max_unroll(local_loop_unroll_factor);
    } else if (post_loop_allowed) {
      if (!small_basic_type) {
        // avoid replication context for small basic types in programmable masked loops
        cl->set_slp_max_unroll(local_loop_unroll_factor);
      }
    }
  }
}
419 
420 //------------------------------SLP_extract---------------------------
421 // Extract the superword level parallelism
422 //
423 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
424 //    this list from first to last, all definitions are visited before their uses.
425 //
426 // 2) A point-to-point dependence graph is constructed between memory references.
427 //    This simplies the upcoming "independence" checker.
428 //
429 // 3) The maximum depth in the node graph from the beginning of the block
430 //    to each node is computed.  This is used to prune the graph search
431 //    in the independence checker.
432 //
433 // 4) For integer types, the necessary bit width is propagated backwards
434 //    from stores to allow packed operations on byte, char, and short
435 //    integers.  This reverses the promotion to type "int" that javac
436 //    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
437 //
438 // 5) One of the memory references is picked to be an aligned vector reference.
439 //    The pre-loop trip count is adjusted to align this reference in the
440 //    unrolled body.
441 //
442 // 6) The initial set of pack pairs is seeded with memory references.
443 //
444 // 7) The set of pack pairs is extended by following use->def and def->use links.
445 //
446 // 8) The pairs are combined into vector sized packs.
447 //
448 // 9) Reorder the memory slices to co-locate members of the memory packs.
449 //
450 // 10) Generate ideal vector nodes for the final set of packs and where necessary,
451 //    inserting scalar promotion, vector creation from multiple scalars, and
452 //    extraction of scalar values from vectors.
453 //
// Driver for the SLP pipeline (see the step-by-step description in the
// comment block above).  For main loops: build the block and dependence
// graph, seed packs from adjacent memory references, extend/combine/filter
// them, then emit vector nodes via output().  For post loops on predicated
// targets: re-run unrolling_analysis() and, when it reproduces the main
// loop's mapping, create singleton packs for masked single-iteration
// vectorization.
void SuperWord::SLP_extract() {

#ifndef PRODUCT
  if (_do_vector_loop && TraceSuperWord) {
    tty->print("SuperWord::SLP_extract\n");
    tty->print("input loop\n");
    _lpt->dump_head();
    _lpt->dump();
    for (uint i = 0; i < _lpt->_body.size(); i++) {
      _lpt->_body.at(i)->dump();
    }
  }
#endif
  // Ready the block
  if (!construct_bb()) {
    return; // Exit if no interesting nodes or complex graph.
  }

  // build    _dg, _disjoint_ptrs
  dependence_graph();

  // compute function depth(Node*)
  compute_max_depth();

  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
  if (cl->is_main_loop()) {
    if (_do_vector_loop_experimental) {
      if (mark_generations() != -1) {
        hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly

        // hoist_loads_in_graph() invalidated the block and dependence info,
        // so redo the three setup steps from scratch.
        if (!construct_bb()) {
          return; // Exit if no interesting nodes or complex graph.
        }
        dependence_graph();
        compute_max_depth();
      }

#ifndef PRODUCT
      if (TraceSuperWord) {
        tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
        _lpt->dump_head();
        for (int j = 0; j < _block.length(); j++) {
          Node* n = _block.at(j);
          int d = depth(n);
          for (int i = 0; i < d; i++) tty->print("%s", "  ");
          tty->print("%d :", d);
          n->dump();
        }
      }
#endif
    }

    compute_vector_element_type();

    // Attempt vectorization

    find_adjacent_refs();

    if (align_to_ref() == NULL) {
      return; // Did not find memory reference to align vectors
    }

    extend_packlist();

    if (_do_vector_loop_experimental) {
      if (_packset.length() == 0) {
#ifndef PRODUCT
        if (TraceSuperWord) {
          tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
        }
#endif
        pack_parallel();
      }
    }

    combine_packs();

    construct_my_pack_map();
    if (UseVectorCmov) {
      merge_packs_to_cmovd();
    }

    filter_packs();

    schedule();
  } else if (post_loop_allowed) {
    int saved_mapped_unroll_factor = cl->slp_max_unroll();
    if (saved_mapped_unroll_factor) {
      int vector_mapped_unroll_factor = saved_mapped_unroll_factor;

      // now reset the slp_unroll_factor so that we can check the analysis mapped
      // what the vector loop was mapped to
      cl->set_slp_max_unroll(0);

      // do the analysis on the post loop
      unrolling_analysis(vector_mapped_unroll_factor);

      // if our analyzed loop is a canonical fit, start processing it
      if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
        // now add the vector nodes to packsets
        for (int i = 0; i < _post_block.length(); i++) {
          Node* n = _post_block.at(i);
          Node_List* singleton = new Node_List();
          singleton->push(n);
          _packset.append(singleton);
          set_my_pack(n, singleton);
        }

        // map base types for vector usage
        compute_vector_element_type();
      } else {
        return;
      }
    } else {
      // for some reason we could not map the slp analysis state of the vectorized loop
      return;
    }
  }

  output();
}
576 
577 //------------------------------find_adjacent_refs---------------------------
578 // Find the adjacent memory references and create pack pairs for them.
579 // This is the initial set of packs that will then be extended by
580 // following use->def and def->use links.  The align positions are
581 // assigned relative to the reference "align_to_ref"
// Find the adjacent memory references and create pack pairs for them.
// This is the initial set of packs that will then be extended by
// following use->def and def->use links.  The align positions are
// assigned relative to the reference "align_to_ref".  The loop repeatedly
// picks the most-referenced memory op as a candidate alignment anchor,
// assigns alignments to comparable ops, and either pairs adjacent ops into
// packs or (when alignment constraints forbid it) discards that type and
// re-selects a new best anchor.  The final anchor is published via
// set_align_to_ref() and may be NULL if nothing was suitable.
void SuperWord::find_adjacent_refs() {
  // Get list of memory operations
  Node_List memops;
  for (int i = 0; i < _block.length(); i++) {
    Node* n = _block.at(i);
    // Only plain primitive loads/stores participate; memory_alignment() ==
    // bottom_align filters out ops whose address pattern we cannot analyze.
    if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
        is_java_primitive(n->as_Mem()->memory_type())) {
      int align = memory_alignment(n->as_Mem(), 0);
      if (align != bottom_align) {
        memops.push(n);
      }
    }
  }
  if (TraceSuperWord) {
    tty->print_cr("\nfind_adjacent_refs found %d memops", memops.size());
  }

  Node_List align_to_refs;
  int max_idx;
  int best_iv_adjustment = 0;
  MemNode* best_align_to_mem_ref = NULL;

  while (memops.size() != 0) {
    // Find a memory reference to align to.
    MemNode* mem_ref = find_align_to_ref(memops, max_idx);
    if (mem_ref == NULL) break;
    align_to_refs.push(mem_ref);
    int iv_adjustment = get_iv_adjustment(mem_ref);

    if (best_align_to_mem_ref == NULL) {
      // Set memory reference which is the best from all memory operations
      // to be used for alignment. The pre-loop trip count is modified to align
      // this reference to a vector-aligned address.
      best_align_to_mem_ref = mem_ref;
      best_iv_adjustment = iv_adjustment;
      NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
    }

    SWPointer align_to_ref_p(mem_ref, this, NULL, false);
    // Set alignment relative to "align_to_ref" for all related memory operations.
    for (int i = memops.size() - 1; i >= 0; i--) {
      MemNode* s = memops.at(i)->as_Mem();
      if (isomorphic(s, mem_ref) &&
           (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
        SWPointer p2(s, this, NULL, false);
        if (p2.comparable(align_to_ref_p)) {
          int align = memory_alignment(s, iv_adjustment);
          set_alignment(s, align);
        }
      }
    }

    // Create initial pack pairs of memory operations for which
    // alignment is set and vectors will be aligned.
    bool create_pack = true;
    if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
      if (vectors_should_be_aligned()) {
        int vw = vector_width(mem_ref);
        int vw_best = vector_width(best_align_to_mem_ref);
        if (vw > vw_best) {
          // Do not vectorize a memory access with more elements per vector
          // if unaligned memory access is not allowed because number of
          // iterations in pre-loop will be not enough to align it.
          create_pack = false;
        } else {
          SWPointer p2(best_align_to_mem_ref, this, NULL, false);
          if (!align_to_ref_p.invar_equals(p2)) {
            // Do not vectorize memory accesses with different invariants
            // if unaligned memory accesses are not allowed.
            create_pack = false;
          }
        }
      }
    } else {
      if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
        // Can't allow vectorization of unaligned memory accesses with the
        // same type since it could be overlapped accesses to the same array.
        create_pack = false;
      } else {
        // Allow independent (different type) unaligned memory operations
        // if HW supports them.
        if (vectors_should_be_aligned()) {
          create_pack = false;
        } else {
          // Check if packs of the same memory type but
          // with a different alignment were created before.
          for (uint i = 0; i < align_to_refs.size(); i++) {
            MemNode* mr = align_to_refs.at(i)->as_Mem();
            if (mr == mem_ref) {
              // Skip when we are looking at same memory operation.
              continue;
            }
            if (same_velt_type(mr, mem_ref) &&
                memory_alignment(mr, iv_adjustment) != 0)
              create_pack = false;
          }
        }
      }
    }
    if (create_pack) {
      // Pair up every two adjacent, packable memory ops whose alignment
      // was assigned above (top_align marks unassigned ops).
      for (uint i = 0; i < memops.size(); i++) {
        Node* s1 = memops.at(i);
        int align = alignment(s1);
        if (align == top_align) continue;
        for (uint j = 0; j < memops.size(); j++) {
          Node* s2 = memops.at(j);
          if (alignment(s2) == top_align) continue;
          if (s1 != s2 && are_adjacent_refs(s1, s2)) {
            if (stmts_can_pack(s1, s2, align)) {
              Node_List* pair = new Node_List();
              pair->push(s1);
              pair->push(s2);
              if (!_do_vector_loop || same_origin_idx(s1, s2)) {
                _packset.append(pair);
              }
            }
          }
        }
      }
    } else { // Don't create unaligned pack
      // First, remove remaining memory ops of the same type from the list.
      for (int i = memops.size() - 1; i >= 0; i--) {
        MemNode* s = memops.at(i)->as_Mem();
        if (same_velt_type(s, mem_ref)) {
          memops.remove(i);
        }
      }

      // Second, remove already constructed packs of the same type.
      for (int i = _packset.length() - 1; i >= 0; i--) {
        Node_List* p = _packset.at(i);
        MemNode* s = p->at(0)->as_Mem();
        if (same_velt_type(s, mem_ref)) {
          remove_pack_at(i);
        }
      }

      // If needed find the best memory reference for loop alignment again.
      if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
        // Put memory ops from remaining packs back on memops list for
        // the best alignment search.
        uint orig_msize = memops.size();
        for (int i = 0; i < _packset.length(); i++) {
          Node_List* p = _packset.at(i);
          MemNode* s = p->at(0)->as_Mem();
          assert(!same_velt_type(s, mem_ref), "sanity");
          memops.push(s);
        }
        best_align_to_mem_ref = find_align_to_ref(memops, max_idx);
        if (best_align_to_mem_ref == NULL) {
          if (TraceSuperWord) {
            tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL");
          }
          // best_align_to_mem_ref will be used for adjusting the pre-loop limit in
          // SuperWord::align_initial_loop_index. Find one with the biggest vector size,
          // smallest data size and smallest iv offset from memory ops from remaining packs.
          if (_packset.length() > 0) {
            if (orig_msize == 0) {
              best_align_to_mem_ref = memops.at(max_idx)->as_Mem();
            } else {
              // Drop the loose (non-pack) ops so only pack members remain,
              // then let find_align_to_ref() compute max_idx over them.
              for (uint i = 0; i < orig_msize; i++) {
                memops.remove(0);
              }
              best_align_to_mem_ref = find_align_to_ref(memops, max_idx);
              assert(best_align_to_mem_ref == NULL, "sanity");
              best_align_to_mem_ref = memops.at(max_idx)->as_Mem();
            }
            assert(best_align_to_mem_ref != NULL, "sanity");
          }
          break;
        }
        best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
        NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
        // Restore list.
        while (memops.size() > orig_msize)
          (void)memops.pop();
      }
    } // unaligned memory accesses

    // Remove used mem nodes.
    for (int i = memops.size() - 1; i >= 0; i--) {
      MemNode* m = memops.at(i)->as_Mem();
      if (alignment(m) != top_align) {
        memops.remove(i);
      }
    }

  } // while (memops.size() != 0
  set_align_to_ref(best_align_to_mem_ref);

  if (TraceSuperWord) {
    tty->print_cr("\nAfter find_adjacent_refs");
    print_packset();
  }
}
777 
778 #ifndef PRODUCT
find_adjacent_refs_trace_1(Node * best_align_to_mem_ref,int best_iv_adjustment)779 void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment) {
780   if (is_trace_adjacent()) {
781     tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d",
782        best_align_to_mem_ref->_idx, best_iv_adjustment);
783        best_align_to_mem_ref->dump();
784   }
785 }
786 #endif
787 
//------------------------------find_align_to_ref---------------------------
// Find a memory reference to align the loop induction variable to.
// Looks first at stores then at loads, looking for a memory reference
// with the largest number of references similar to it.
// On return, 'idx' holds the index of the chosen reference within 'memops'
// (-1 if none) and the reference itself is returned (NULL if no store or
// load had any comparable references).
MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
  // cmp_ct.at(i) counts how many other memops are "comparable" to memops.at(i).
  GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);

  // Count number of comparable memory ops
  for (uint i = 0; i < memops.size(); i++) {
    MemNode* s1 = memops.at(i)->as_Mem();
    SWPointer p1(s1, this, NULL, false);
    // Only discard unalignable memory references if vector memory references
    // should be aligned on this platform.
    if (vectors_should_be_aligned() && !ref_is_alignable(p1)) {
      *cmp_ct.adr_at(i) = 0;
      continue;
    }
    // Each comparable pair (i, j) bumps the counters of both members.
    for (uint j = i+1; j < memops.size(); j++) {
      MemNode* s2 = memops.at(j)->as_Mem();
      if (isomorphic(s1, s2)) {
        SWPointer p2(s2, this, NULL, false);
        if (p1.comparable(p2)) {
          (*cmp_ct.adr_at(i))++;
          (*cmp_ct.adr_at(j))++;
        }
      }
    }
  }

  // Find Store (or Load) with the greatest number of "comparable" references,
  // biggest vector size, smallest data size and smallest iv offset.
  int max_ct        = 0;
  int max_vw        = 0;
  int max_idx       = -1;
  int min_size      = max_jint;
  int min_iv_offset = max_jint;
  for (uint j = 0; j < memops.size(); j++) {
    MemNode* s = memops.at(j)->as_Mem();
    if (s->is_Store()) {
      int vw = vector_width_in_bytes(s);
      assert(vw > 1, "sanity");
      SWPointer p(s, this, NULL, false);
      // Lexicographic best-of: (cmp_ct desc, vw desc, data_size asc, iv offset asc).
      if ( cmp_ct.at(j) >  max_ct ||
          (cmp_ct.at(j) == max_ct &&
            ( vw >  max_vw ||
             (vw == max_vw &&
              ( data_size(s) <  min_size ||
               (data_size(s) == min_size &&
                p.offset_in_bytes() < min_iv_offset)))))) {
        max_ct = cmp_ct.at(j);
        max_vw = vw;
        max_idx = j;
        min_size = data_size(s);
        min_iv_offset = p.offset_in_bytes();
      }
    }
  }
  // If no stores, look at loads
  if (max_ct == 0) {
    for (uint j = 0; j < memops.size(); j++) {
      MemNode* s = memops.at(j)->as_Mem();
      if (s->is_Load()) {
        int vw = vector_width_in_bytes(s);
        assert(vw > 1, "sanity");
        SWPointer p(s, this, NULL, false);
        // Same lexicographic criteria as the store scan above.
        if ( cmp_ct.at(j) >  max_ct ||
            (cmp_ct.at(j) == max_ct &&
              ( vw >  max_vw ||
               (vw == max_vw &&
                ( data_size(s) <  min_size ||
                 (data_size(s) == min_size &&
                  p.offset_in_bytes() < min_iv_offset)))))) {
          max_ct = cmp_ct.at(j);
          max_vw = vw;
          max_idx = j;
          min_size = data_size(s);
          min_iv_offset = p.offset_in_bytes();
        }
      }
    }
  }

#ifdef ASSERT
  if (TraceSuperWord && Verbose) {
    tty->print_cr("\nVector memops after find_align_to_ref");
    for (uint i = 0; i < memops.size(); i++) {
      MemNode* s = memops.at(i)->as_Mem();
      s->dump();
    }
  }
#endif

  // Report the winning index to the caller even if max_ct == 0 (then -1).
  idx = max_idx;
  if (max_ct > 0) {
#ifdef ASSERT
    if (TraceSuperWord) {
      tty->print("\nVector align to node: ");
      memops.at(max_idx)->as_Mem()->dump();
    }
#endif
    return memops.at(max_idx)->as_Mem();
  }
  return NULL;
}
892 
893 //------------------span_works_for_memory_size-----------------------------
span_works_for_memory_size(MemNode * mem,int span,int mem_size,int offset)894 static bool span_works_for_memory_size(MemNode* mem, int span, int mem_size, int offset) {
895   bool span_matches_memory = false;
896   if ((mem_size == type2aelembytes(T_BYTE) || mem_size == type2aelembytes(T_SHORT))
897     && ABS(span) == type2aelembytes(T_INT)) {
898     // There is a mismatch on span size compared to memory.
899     for (DUIterator_Fast jmax, j = mem->fast_outs(jmax); j < jmax; j++) {
900       Node* use = mem->fast_out(j);
901       if (!VectorNode::is_type_transition_to_int(use)) {
902         return false;
903       }
904     }
905     // If all uses transition to integer, it means that we can successfully align even on mismatch.
906     return true;
907   }
908   else {
909     span_matches_memory = ABS(span) == mem_size;
910   }
911   return span_matches_memory && (ABS(offset) % mem_size) == 0;
912 }
913 
//------------------------------ref_is_alignable---------------------------
// Can the preloop align the reference to position zero in the vector?
// 'p' is the decomposed address (base + invar + scale*iv + offset).
bool SuperWord::ref_is_alignable(SWPointer& p) {
  if (!p.has_iv()) {
    return true;   // no induction variable
  }
  CountedLoopEndNode* pre_end = pre_loop_end();
  assert(pre_end->stride_is_con(), "pre loop stride is constant");
  int preloop_stride = pre_end->stride_con();

  // Bytes by which the address advances per pre-loop iteration.
  int span = preloop_stride * p.scale_in_bytes();
  int mem_size = p.memory_size();
  int offset   = p.offset_in_bytes();
  // Stride one accesses are alignable if offset is aligned to memory operation size.
  // Offset can be unaligned when UseUnalignedAccesses is used.
  if (span_works_for_memory_size(p.mem(), span, mem_size, offset)) {
    return true;
  }
  // If the initial offset from start of the object is computable,
  // check if the pre-loop can align the final offset accordingly.
  //
  // In other words: Can we find an i such that the offset
  // after i pre-loop iterations is aligned to vw?
  //   (init_offset + pre_loop) % vw == 0              (1)
  // where
  //   pre_loop = i * span
  // is the number of bytes added to the offset by i pre-loop iterations.
  //
  // For this to hold we need pre_loop to increase init_offset by
  //   pre_loop = vw - (init_offset % vw)
  //
  // This is only possible if pre_loop is divisible by span because each
  // pre-loop iteration increases the initial offset by 'span' bytes:
  //   (vw - (init_offset % vw)) % span == 0
  //
  int vw = vector_width_in_bytes(p.mem());
  assert(vw > 1, "sanity");
  Node* init_nd = pre_end->init_trip();
  // Only a constant trip start with no invariant term makes init_offset
  // computable at compile time.
  if (init_nd->is_Con() && p.invar() == NULL) {
    int init = init_nd->bottom_type()->is_int()->get_con();
    int init_offset = init * p.scale_in_bytes() + offset;
    if (init_offset < 0) { // negative offset from object start?
      return false;        // may happen in dead loop
    }
    if (vw % span == 0) {
      // If vm is a multiple of span, we use formula (1).
      if (span > 0) {
        return (vw - (init_offset % vw)) % span == 0;
      } else {
        assert(span < 0, "nonzero stride * scale");
        // Negative span: the pre-loop decreases the offset.
        return (init_offset % vw) % -span == 0;
      }
    } else if (span % vw == 0) {
      // If span is a multiple of vw, we can simplify formula (1) to:
      //   (init_offset + i * span) % vw == 0
      //     =>
      //   (init_offset % vw) + ((i * span) % vw) == 0
      //     =>
      //   init_offset % vw == 0
      //
      // Because we add a multiple of vw to the initial offset, the final
      // offset is a multiple of vw if and only if init_offset is a multiple.
      //
      return (init_offset % vw) == 0;
    }
  }
  // Neither case applies: the pre-loop cannot guarantee alignment.
  return false;
}
982 //---------------------------get_vw_bytes_special------------------------
get_vw_bytes_special(MemNode * s)983 int SuperWord::get_vw_bytes_special(MemNode* s) {
984   // Get the vector width in bytes.
985   int vw = vector_width_in_bytes(s);
986 
987   // Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
988   BasicType btype = velt_basic_type(s);
989   if (type2aelembytes(btype) == 2) {
990     bool should_combine_adjacent = true;
991     for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
992       Node* user = s->fast_out(i);
993       if (!VectorNode::is_muladds2i(user)) {
994         should_combine_adjacent = false;
995       }
996     }
997     if (should_combine_adjacent) {
998       vw = MIN2(Matcher::max_vector_size(btype)*type2aelembytes(btype), vw * 2);
999     }
1000   }
1001 
1002   return vw;
1003 }
1004 
//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
// Returns the number of iv steps (elements, not bytes) that must be added
// so the main-loop reference becomes aligned to the vector width.
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
  SWPointer align_to_ref_p(mem_ref, this, NULL, false);
  int offset = align_to_ref_p.offset_in_bytes();
  int scale  = align_to_ref_p.scale_in_bytes();
  int elt_size = align_to_ref_p.memory_size();
  int vw       = get_vw_bytes_special(mem_ref);
  assert(vw > 1, "sanity");
  int iv_adjustment;
  if (scale != 0) {
    // Direction in which the address moves as the iv steps.
    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
    // At least one iteration is executed in pre-loop by default. As result
    // several iterations are needed to align memory operations in main-loop even
    // if offset is 0.
    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
    // iv_adjustment_in_bytes must be a multiple of elt_size if vector memory
    // references should be aligned on this platform.
    assert((ABS(iv_adjustment_in_bytes) % elt_size) == 0 || !vectors_should_be_aligned(),
           "(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size);
    // Convert from bytes to iv steps.
    iv_adjustment = iv_adjustment_in_bytes/elt_size;
  } else {
    // This memory op is not dependent on iv (scale == 0)
    iv_adjustment = 0;
  }

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
      mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
    mem_ref->dump();
  }
#endif
  return iv_adjustment;
}
1040 
//---------------------------dependence_graph---------------------------
// Construct dependency graph.
// Add dependence edges to load/store nodes for memory dependence
//    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
void SuperWord::dependence_graph() {
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  // First, assign a dependence node to each memory node
  for (int i = 0; i < _block.length(); i++ ) {
    Node *n = _block.at(i);
    if (n->is_Mem() || (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
      _dg.make_node(n);
    }
  }

  // For each memory slice, create the dependences
  for (int i = 0; i < _mem_slice_head.length(); i++) {
    Node* n      = _mem_slice_head.at(i);
    Node* n_tail = _mem_slice_tail.at(i);

    // Get slice in predecessor order (last is first)
    if (cl->is_main_loop()) {
      mem_slice_preds(n_tail, n, _nlist);
    }

#ifndef PRODUCT
    if(TraceSuperWord && Verbose) {
      tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
      for (int j = _nlist.length() - 1; j >= 0 ; j--) {
        _nlist.at(j)->dump();
      }
    }
#endif
    // Make the slice dependent on the root
    DepMem* slice = _dg.dep(n);
    _dg.make_edge(_dg.root(), slice);

    // Create a sink for the slice
    DepMem* slice_sink = _dg.make_node(NULL);
    _dg.make_edge(slice_sink, _dg.tail());

    // Now visit each pair of memory ops, creating the edges
    for (int j = _nlist.length() - 1; j >= 0 ; j--) {
      Node* s1 = _nlist.at(j);

      // If no dependency yet, use slice
      if (_dg.dep(s1)->in_cnt() == 0) {
        _dg.make_edge(slice, s1);
      }
      SWPointer p1(s1->as_Mem(), this, NULL, false);
      bool sink_dependent = true;
      for (int k = j - 1; k >= 0; k--) {
        Node* s2 = _nlist.at(k);
        // A pair of loads never conflicts; skip it.
        if (s1->is_Load() && s2->is_Load())
          continue;
        SWPointer p2(s2->as_Mem(), this, NULL, false);

        int cmp = p1.cmp(p2);
        if (SuperWordRTDepCheck &&
            p1.base() != p2.base() && p1.valid() && p2.valid()) {
          // Create a runtime check to disambiguate
          OrderedPair pp(p1.base(), p2.base());
          _disjoint_ptrs.append_if_missing(pp);
        } else if (!SWPointer::not_equal(cmp)) {
          // Possibly same address
          _dg.make_edge(s1, s2);
          sink_dependent = false;
        }
      }
      // No possibly-aliasing successor was found; anchor s1 to the sink so
      // the slice stays connected.
      if (sink_dependent) {
        _dg.make_edge(s1, slice_sink);
      }
    }

    if (TraceSuperWord) {
      tty->print_cr("\nDependence graph for slice: %d", n->_idx);
      for (int q = 0; q < _nlist.length(); q++) {
        _dg.print(_nlist.at(q));
      }
      tty->cr();
    }

    // _nlist is a scratch list reused for the next slice.
    _nlist.clear();
  }

  if (TraceSuperWord) {
    tty->print_cr("\ndisjoint_ptrs: %s", _disjoint_ptrs.length() > 0 ? "" : "NONE");
    for (int r = 0; r < _disjoint_ptrs.length(); r++) {
      _disjoint_ptrs.at(r).print();
      tty->cr();
    }
    tty->cr();
  }

}
1135 
//---------------------------mem_slice_preds---------------------------
// Return a memory slice (node list) in predecessor order starting at "start"
// Walks the memory-input chain from 'start' (the slice tail) up to 'stop'
// (the slice head), pushing each memory node and any in-block loads that
// hang off it into 'preds'.
void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
  assert(preds.length() == 0, "start empty");
  Node* n = start;
  Node* prev = NULL;
  while (true) {
    NOT_PRODUCT( if(is_trace_mem_slice()) tty->print_cr("SuperWord::mem_slice_preds: n %d", n->_idx);)
    assert(in_bb(n), "must be in block");
    for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
      Node* out = n->fast_out(i);
      if (out->is_Load()) {
        // Loads that consume this memory state and live in the block belong
        // to the slice.
        if (in_bb(out)) {
          preds.push(out);
          if (TraceSuperWord && Verbose) {
            tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", out->_idx);
          }
        }
      } else {
        // FIXME
        if (out->is_MergeMem() && !in_bb(out)) {
          // Either unrolling is causing a memory edge not to disappear,
          // or need to run igvn.optimize() again before SLP
        } else if (out->is_Phi() && out->bottom_type() == Type::MEMORY && !in_bb(out)) {
          // Ditto.  Not sure what else to check further.
        } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
          // StoreCM has an input edge used as a precedence edge.
          // Maybe an issue when oop stores are vectorized.
        } else {
          assert(out == prev || prev == NULL, "no branches off of store slice");
        }
      }//else
    }//for
    // 'stop' (the slice head) is not itself added to the list.
    if (n == stop) break;
    preds.push(n);
    if (TraceSuperWord && Verbose) {
      tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", n->_idx);
    }
    prev = n;
    assert(n->is_Mem(), "unexpected node %s", n->Name());
    // Step up the chain through the memory input.
    n = n->in(MemNode::Memory);
  }
}
1179 
1180 //------------------------------stmts_can_pack---------------------------
1181 // Can s1 and s2 be in a pack with s1 immediately preceding s2 and
1182 // s1 aligned at "align"
stmts_can_pack(Node * s1,Node * s2,int align)1183 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
1184 
1185   // Do not use superword for non-primitives
1186   BasicType bt1 = velt_basic_type(s1);
1187   BasicType bt2 = velt_basic_type(s2);
1188   if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
1189     return false;
1190   if (Matcher::max_vector_size(bt1) < 2) {
1191     return false; // No vectors for this type
1192   }
1193 
1194   if (isomorphic(s1, s2)) {
1195     if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) {
1196       if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
1197         if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
1198           int s1_align = alignment(s1);
1199           int s2_align = alignment(s2);
1200           if (s1_align == top_align || s1_align == align) {
1201             if (s2_align == top_align || s2_align == align + data_size(s1)) {
1202               return true;
1203             }
1204           }
1205         }
1206       }
1207     }
1208   }
1209   return false;
1210 }
1211 
1212 //------------------------------exists_at---------------------------
1213 // Does s exist in a pack at position pos?
exists_at(Node * s,uint pos)1214 bool SuperWord::exists_at(Node* s, uint pos) {
1215   for (int i = 0; i < _packset.length(); i++) {
1216     Node_List* p = _packset.at(i);
1217     if (p->at(pos) == s) {
1218       return true;
1219     }
1220   }
1221   return false;
1222 }
1223 
1224 //------------------------------are_adjacent_refs---------------------------
1225 // Is s1 immediately before s2 in memory?
are_adjacent_refs(Node * s1,Node * s2)1226 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
1227   if (!s1->is_Mem() || !s2->is_Mem()) return false;
1228   if (!in_bb(s1)    || !in_bb(s2))    return false;
1229 
1230   // Do not use superword for non-primitives
1231   if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
1232       !is_java_primitive(s2->as_Mem()->memory_type())) {
1233     return false;
1234   }
1235 
1236   // FIXME - co_locate_pack fails on Stores in different mem-slices, so
1237   // only pack memops that are in the same alias set until that's fixed.
1238   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
1239       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
1240     return false;
1241   SWPointer p1(s1->as_Mem(), this, NULL, false);
1242   SWPointer p2(s2->as_Mem(), this, NULL, false);
1243   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
1244   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
1245   return diff == data_size(s1);
1246 }
1247 
//------------------------------isomorphic---------------------------
// Are s1 and s2 similar?
// Requires matching opcode, input count and velt type; then checks that
// the control inputs are either identical or both loop-invariant range
// checks feeding MulAddS2I uses.
bool SuperWord::isomorphic(Node* s1, Node* s2) {
  if (s1->Opcode() != s2->Opcode()) return false;
  if (s1->req() != s2->req()) return false;
  if (!same_velt_type(s1, s2)) return false;
  Node* s1_ctrl = s1->in(0);
  Node* s2_ctrl = s2->in(0);
  // If the control nodes are equivalent, no further checks are required to test for isomorphism.
  if (s1_ctrl == s2_ctrl) {
    return true;
  } else {
    // A NULL control counts as invariant.
    bool s1_ctrl_inv = ((s1_ctrl == NULL) ? true : lpt()->is_invariant(s1_ctrl));
    bool s2_ctrl_inv = ((s2_ctrl == NULL) ? true : lpt()->is_invariant(s2_ctrl));
    // If the control nodes are not invariant for the loop, fail isomorphism test.
    if (!s1_ctrl_inv || !s2_ctrl_inv) {
      return false;
    }
    if(s1_ctrl != NULL && s2_ctrl != NULL) {
      // Look through projections to the controlling test.
      if (s1_ctrl->is_Proj()) {
        s1_ctrl = s1_ctrl->in(0);
        assert(lpt()->is_invariant(s1_ctrl), "must be invariant");
      }
      if (s2_ctrl->is_Proj()) {
        s2_ctrl = s2_ctrl->in(0);
        assert(lpt()->is_invariant(s2_ctrl), "must be invariant");
      }
      if (!s1_ctrl->is_RangeCheck() || !s2_ctrl->is_RangeCheck()) {
        return false;
      }
    }
    // Control nodes are invariant. However, we have no way of checking whether they resolve
    // in an equivalent manner. But, we know that invariant range checks are guaranteed to
    // throw before the loop (if they would have thrown). Thus, the loop would not have been reached.
    // Therefore, if the control nodes for both are range checks, we accept them to be isomorphic.
    for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
      Node* t1 = s1->fast_out(i);
      for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
        Node* t2 = s2->fast_out(j);
        // NOTE(review): acceptance additionally requires some use pair to be
        // MulAddS2I — confirm this restriction is intended for all callers.
        if (VectorNode::is_muladds2i(t1) && VectorNode::is_muladds2i(t2)) {
          return true;
        }
      }
    }
  }
  return false;
}
1295 
1296 //------------------------------independent---------------------------
1297 // Is there no data path from s1 to s2 or s2 to s1?
independent(Node * s1,Node * s2)1298 bool SuperWord::independent(Node* s1, Node* s2) {
1299   //  assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
1300   int d1 = depth(s1);
1301   int d2 = depth(s2);
1302   if (d1 == d2) return s1 != s2;
1303   Node* deep    = d1 > d2 ? s1 : s2;
1304   Node* shallow = d1 > d2 ? s2 : s1;
1305 
1306   visited_clear();
1307 
1308   return independent_path(shallow, deep);
1309 }
1310 
1311 //--------------------------have_similar_inputs-----------------------
1312 // For a node pair (s1, s2) which is isomorphic and independent,
1313 // do s1 and s2 have similar input edges?
have_similar_inputs(Node * s1,Node * s2)1314 bool SuperWord::have_similar_inputs(Node* s1, Node* s2) {
1315   // assert(isomorphic(s1, s2) == true, "check isomorphic");
1316   // assert(independent(s1, s2) == true, "check independent");
1317   if (s1->req() > 1 && !s1->is_Store() && !s1->is_Load()) {
1318     for (uint i = 1; i < s1->req(); i++) {
1319       if (s1->in(i)->Opcode() != s2->in(i)->Opcode()) return false;
1320     }
1321   }
1322   return true;
1323 }
1324 
1325 //------------------------------reduction---------------------------
1326 // Is there a data path between s1 and s2 and the nodes reductions?
reduction(Node * s1,Node * s2)1327 bool SuperWord::reduction(Node* s1, Node* s2) {
1328   bool retValue = false;
1329   int d1 = depth(s1);
1330   int d2 = depth(s2);
1331   if (d2 > d1) {
1332     if (s1->is_reduction() && s2->is_reduction()) {
1333       // This is an ordered set, so s1 should define s2
1334       for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1335         Node* t1 = s1->fast_out(i);
1336         if (t1 == s2) {
1337           // both nodes are reductions and connected
1338           retValue = true;
1339         }
1340       }
1341     }
1342   }
1343 
1344   return retValue;
1345 }
1346 
1347 //------------------------------independent_path------------------------------
1348 // Helper for independent
independent_path(Node * shallow,Node * deep,uint dp)1349 bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
1350   if (dp >= 1000) return false; // stop deep recursion
1351   visited_set(deep);
1352   int shal_depth = depth(shallow);
1353   assert(shal_depth <= depth(deep), "must be");
1354   for (DepPreds preds(deep, _dg); !preds.done(); preds.next()) {
1355     Node* pred = preds.current();
1356     if (in_bb(pred) && !visited_test(pred)) {
1357       if (shallow == pred) {
1358         return false;
1359       }
1360       if (shal_depth < depth(pred) && !independent_path(shallow, pred, dp+1)) {
1361         return false;
1362       }
1363     }
1364   }
1365   return true;
1366 }
1367 
1368 //------------------------------set_alignment---------------------------
set_alignment(Node * s1,Node * s2,int align)1369 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
1370   set_alignment(s1, align);
1371   if (align == top_align || align == bottom_align) {
1372     set_alignment(s2, align);
1373   } else {
1374     set_alignment(s2, align + data_size(s1));
1375   }
1376 }
1377 
1378 //------------------------------data_size---------------------------
data_size(Node * s)1379 int SuperWord::data_size(Node* s) {
1380   Node* use = NULL; //test if the node is a candidate for CMoveV optimization, then return the size of CMov
1381   if (UseVectorCmov) {
1382     use = _cmovev_kit.is_Bool_candidate(s);
1383     if (use != NULL) {
1384       return data_size(use);
1385     }
1386     use = _cmovev_kit.is_CmpD_candidate(s);
1387     if (use != NULL) {
1388       return data_size(use);
1389     }
1390   }
1391 
1392   int bsize = type2aelembytes(velt_basic_type(s));
1393   assert(bsize != 0, "valid size");
1394   return bsize;
1395 }
1396 
1397 //------------------------------extend_packlist---------------------------
1398 // Extend packset by following use->def and def->use links from pack members.
extend_packlist()1399 void SuperWord::extend_packlist() {
1400   bool changed;
1401   do {
1402     packset_sort(_packset.length());
1403     changed = false;
1404     for (int i = 0; i < _packset.length(); i++) {
1405       Node_List* p = _packset.at(i);
1406       changed |= follow_use_defs(p);
1407       changed |= follow_def_uses(p);
1408     }
1409   } while (changed);
1410 
1411   if (_race_possible) {
1412     for (int i = 0; i < _packset.length(); i++) {
1413       Node_List* p = _packset.at(i);
1414       order_def_uses(p);
1415     }
1416   }
1417 
1418   if (TraceSuperWord) {
1419     tty->print_cr("\nAfter extend_packlist");
1420     print_packset();
1421   }
1422 }
1423 
//------------------------------follow_use_defs---------------------------
// Extend the packset by visiting operand definitions of nodes in pack p
// Returns true if a new pair was appended to the packset.
bool SuperWord::follow_use_defs(Node_List* p) {
  assert(p->size() == 2, "just checking");
  Node* s1 = p->at(0);
  Node* s2 = p->at(1);
  assert(s1->req() == s2->req(), "just checking");
  assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

  if (s1->is_Load()) return false;

  int align = alignment(s1);
  NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, align);)
  bool changed = false;
  // For stores, only the stored-value input is a candidate; for other
  // nodes, all inputs from 1..req() are considered.
  int start = s1->is_Store() ? MemNode::ValueIn   : 1;
  int end   = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
  for (int j = start; j < end; j++) {
    Node* t1 = s1->in(j);
    Node* t2 = s2->in(j);
    if (!in_bb(t1) || !in_bb(t2))
      continue;
    // Pack the matching defs only when legal and estimated profitable.
    if (stmts_can_pack(t1, t2, align)) {
      if (est_savings(t1, t2) >= 0) {
        Node_List* pair = new Node_List();
        pair->push(t1);
        pair->push(t2);
        _packset.append(pair);
        NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: set_alignment(%d, %d, %d)", t1->_idx, t2->_idx, align);)
        set_alignment(t1, t2, align);
        changed = true;
      }
    }
  }
  return changed;
}
1459 
//------------------------------follow_def_uses---------------------------
// Extend the packset by visiting uses of nodes in pack p
// Unlike follow_use_defs, at most ONE new pair (the one with the best
// estimated savings) is appended per call. Returns true if a pair was added.
bool SuperWord::follow_def_uses(Node_List* p) {
  bool changed = false;
  Node* s1 = p->at(0);
  Node* s2 = p->at(1);
  assert(p->size() == 2, "just checking");
  assert(s1->req() == s2->req(), "just checking");
  assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

  if (s1->is_Store()) return false;

  int align = alignment(s1);
  NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: s1 %d, align %d", s1->_idx, align);)
  int savings = -1;
  int num_s1_uses = 0;
  Node* u1 = NULL;
  Node* u2 = NULL;
  // Consider every cross-product pair of uses (t1 of s1, t2 of s2) and
  // remember the packable pair with the highest estimated savings.
  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* t1 = s1->fast_out(i);
    num_s1_uses++;
    if (!in_bb(t1)) continue;
    for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
      Node* t2 = s2->fast_out(j);
      if (!in_bb(t2)) continue;
      if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv
      if (!opnd_positions_match(s1, t1, s2, t2))
        continue;
      if (stmts_can_pack(t1, t2, align)) {
        int my_savings = est_savings(t1, t2);
        if (my_savings > savings) {
          savings = my_savings;
          u1 = t1;
          u2 = t2;
        }
      }
    }
  }
  // Multiple uses of s1 mean operand order may need fixing later
  // (see order_def_uses).
  if (num_s1_uses > 1) {
    _race_possible = true;
  }
  if (savings >= 0) {
    Node_List* pair = new Node_List();
    pair->push(u1);
    pair->push(u2);
    _packset.append(pair);
    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: set_alignment(%d, %d, %d)", u1->_idx, u2->_idx, align);)
    set_alignment(u1, u2, align);
    changed = true;
  }
  return changed;
}
1512 
//------------------------------order_def_uses---------------------------
// For extended packsets, ordinally arrange uses packset by major component
// For each use t1 of the pack leader, find the pack led by t1 and align the
// operand positions of the remaining pack members with it (via
// opnd_positions_match, which may swap edges on commuting ops).
void SuperWord::order_def_uses(Node_List* p) {
  Node* s1 = p->at(0);

  if (s1->is_Store()) return;

  // reductions are always managed beforehand
  if (s1->is_reduction()) return;

  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* t1 = s1->fast_out(i);

    // Only allow operand swap on commuting operations
    // NOTE(review): this 'break' abandons ALL remaining uses on the first
    // non-commuting one, rather than skipping just that use — confirm
    // this is intended.
    if (!t1->is_Add() && !t1->is_Mul() && !VectorNode::is_muladds2i(t1)) {
      break;
    }

    // Now find t1's packset
    Node_List* p2 = NULL;
    for (int j = 0; j < _packset.length(); j++) {
      p2 = _packset.at(j);
      Node* first = p2->at(0);
      if (t1 == first) {
        break;
      }
      p2 = NULL;
    }
    // Arrange all sub components by the major component
    if (p2 != NULL) {
      for (uint j = 1; j < p->size(); j++) {
        Node* d1 = p->at(j);
        Node* u1 = p2->at(j);
        opnd_positions_match(s1, t1, d1, u1);
      }
    }
  }
}
1551 
//---------------------------opnd_positions_match-------------------------
// Is the use of d1 in u1 at the same operand position as d2 in u2?
// May SWAP EDGES on u1/u2 as a side effect to canonicalize operand order
// for commuting operations, reductions and MulAddS2I.
bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
  // check reductions to see if they are marshalled to represent the reduction
  // operator in a specified opnd
  if (u1->is_reduction() && u2->is_reduction()) {
    // ensure reductions have phis and reduction definitions feeding the 1st operand
    Node* first = u1->in(2);
    if (first->is_Phi() || first->is_reduction()) {
      u1->swap_edges(1, 2);
    }
    // ensure reductions have phis and reduction definitions feeding the 1st operand
    first = u2->in(2);
    if (first->is_Phi() || first->is_reduction()) {
      u2->swap_edges(1, 2);
    }
    return true;
  }

  uint ct = u1->req();
  if (ct != u2->req()) return false;
  uint i1 = 0;
  uint i2 = 0;
  // Scan for successive occurrences of d1 in u1 and d2 in u2, checking that
  // they appear at matching operand positions each time.
  do {
    for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
    for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
    if (i1 != i2) {
      if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
        // Further analysis relies on operands position matching.
        u2->swap_edges(i1, i2);
      } else if (VectorNode::is_muladds2i(u2) && u1 != u2) {
        // MulAddS2I has four data inputs; swap in pairs to keep the
        // (mul1, mul2) groupings intact.
        if (i1 == 5 - i2) { // ((i1 == 3 && i2 == 2) || (i1 == 2 && i2 == 3) || (i1 == 1 && i2 == 4) || (i1 == 4 && i2 == 1))
          u2->swap_edges(1, 2);
          u2->swap_edges(3, 4);
        }
        if (i1 == 3 - i2 || i1 == 7 - i2) { // ((i1 == 1 && i2 == 2) || (i1 == 2 && i2 == 1) || (i1 == 3 && i2 == 4) || (i1 == 4 && i2 == 3))
          u2->swap_edges(2, 3);
          u2->swap_edges(1, 4);
        }
        return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs
      } else {
        return false;
      }
    } else if (i1 == i2 && VectorNode::is_muladds2i(u2) && u1 != u2) {
      u2->swap_edges(1, 3);
      u2->swap_edges(2, 4);
      return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs
    }
  } while (i1 < ct);
  return true;
}
1603 
1604 //------------------------------est_savings---------------------------
1605 // Estimate the savings from executing s1 and s2 as a pack
est_savings(Node * s1,Node * s2)1606 int SuperWord::est_savings(Node* s1, Node* s2) {
1607   int save_in = 2 - 1; // 2 operations per instruction in packed form
1608 
1609   // inputs
1610   for (uint i = 1; i < s1->req(); i++) {
1611     Node* x1 = s1->in(i);
1612     Node* x2 = s2->in(i);
1613     if (x1 != x2) {
1614       if (are_adjacent_refs(x1, x2)) {
1615         save_in += adjacent_profit(x1, x2);
1616       } else if (!in_packset(x1, x2)) {
1617         save_in -= pack_cost(2);
1618       } else {
1619         save_in += unpack_cost(2);
1620       }
1621     }
1622   }
1623 
1624   // uses of result
1625   uint ct = 0;
1626   int save_use = 0;
1627   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1628     Node* s1_use = s1->fast_out(i);
1629     for (int j = 0; j < _packset.length(); j++) {
1630       Node_List* p = _packset.at(j);
1631       if (p->at(0) == s1_use) {
1632         for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
1633           Node* s2_use = s2->fast_out(k);
1634           if (p->at(p->size()-1) == s2_use) {
1635             ct++;
1636             if (are_adjacent_refs(s1_use, s2_use)) {
1637               save_use += adjacent_profit(s1_use, s2_use);
1638             }
1639           }
1640         }
1641       }
1642     }
1643   }
1644 
1645   if (ct < s1->outcnt()) save_use += unpack_cost(1);
1646   if (ct < s2->outcnt()) save_use += unpack_cost(1);
1647 
1648   return MAX2(save_in, save_use);
1649 }
1650 
1651 //------------------------------costs---------------------------
// Heuristic profit for packing two adjacent memory references together.
int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; }
// Heuristic cost of gathering ct scalar values into a vector.
int SuperWord::pack_cost(int ct)   { return ct; }
// Heuristic cost of extracting ct scalar values out of a vector.
int SuperWord::unpack_cost(int ct) { return ct; }
1655 
1656 //------------------------------combine_packs---------------------------
1657 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
combine_packs()1658 void SuperWord::combine_packs() {
1659   bool changed = true;
1660   // Combine packs regardless max vector size.
1661   while (changed) {
1662     changed = false;
1663     for (int i = 0; i < _packset.length(); i++) {
1664       Node_List* p1 = _packset.at(i);
1665       if (p1 == NULL) continue;
1666       // Because of sorting we can start at i + 1
1667       for (int j = i + 1; j < _packset.length(); j++) {
1668         Node_List* p2 = _packset.at(j);
1669         if (p2 == NULL) continue;
1670         if (i == j) continue;
1671         if (p1->at(p1->size()-1) == p2->at(0)) {
1672           for (uint k = 1; k < p2->size(); k++) {
1673             p1->push(p2->at(k));
1674           }
1675           _packset.at_put(j, NULL);
1676           changed = true;
1677         }
1678       }
1679     }
1680   }
1681 
1682   // Split packs which have size greater then max vector size.
1683   for (int i = 0; i < _packset.length(); i++) {
1684     Node_List* p1 = _packset.at(i);
1685     if (p1 != NULL) {
1686       BasicType bt = velt_basic_type(p1->at(0));
1687       uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
1688       assert(is_power_of_2(max_vlen), "sanity");
1689       uint psize = p1->size();
1690       if (!is_power_of_2(psize)) {
1691         // Skip pack which can't be vector.
1692         // case1: for(...) { a[i] = i; }    elements values are different (i+x)
1693         // case2: for(...) { a[i] = b[i+1]; }  can't align both, load and store
1694         _packset.at_put(i, NULL);
1695         continue;
1696       }
1697       if (psize > max_vlen) {
1698         Node_List* pack = new Node_List();
1699         for (uint j = 0; j < psize; j++) {
1700           pack->push(p1->at(j));
1701           if (pack->size() >= max_vlen) {
1702             assert(is_power_of_2(pack->size()), "sanity");
1703             _packset.append(pack);
1704             pack = new Node_List();
1705           }
1706         }
1707         _packset.at_put(i, NULL);
1708       }
1709     }
1710   }
1711 
1712   // Compress list.
1713   for (int i = _packset.length() - 1; i >= 0; i--) {
1714     Node_List* p1 = _packset.at(i);
1715     if (p1 == NULL) {
1716       _packset.remove_at(i);
1717     }
1718   }
1719 
1720   if (TraceSuperWord) {
1721     tty->print_cr("\nAfter combine_packs");
1722     print_packset();
1723   }
1724 }
1725 
1726 //-----------------------------construct_my_pack_map--------------------------
1727 // Construct the map from nodes to packs.  Only valid after the
1728 // point where a node is only in one pack (after combine_packs).
construct_my_pack_map()1729 void SuperWord::construct_my_pack_map() {
1730   Node_List* rslt = NULL;
1731   for (int i = 0; i < _packset.length(); i++) {
1732     Node_List* p = _packset.at(i);
1733     for (uint j = 0; j < p->size(); j++) {
1734       Node* s = p->at(j);
1735 #ifdef ASSERT
1736       if (my_pack(s) != NULL) {
1737         s->dump(1);
1738         tty->print_cr("packs[%d]:", i);
1739         print_pack(p);
1740         assert(false, "only in one pack");
1741       }
1742 #endif
1743       set_my_pack(s, p);
1744     }
1745   }
1746 }
1747 
1748 //------------------------------filter_packs---------------------------
1749 // Remove packs that are not implemented or not profitable.
void SuperWord::filter_packs() {
  // Remove packs that are not implemented
  for (int i = _packset.length() - 1; i >= 0; i--) {
    Node_List* pk = _packset.at(i);
    bool impl = implemented(pk);
    if (!impl) {
#ifndef PRODUCT
      if ((TraceSuperWord && Verbose) || _vector_loop_debug) {
        tty->print_cr("Unimplemented");
        pk->at(0)->dump();
      }
#endif
      remove_pack_at(i);
    }
    // Tally reduction vs. non-reduction packs; profitable() consults these
    // counters to decide whether reduction expansion overhead is covered.
    // NOTE(review): packs removed just above are still counted here (pk
    // remains a valid list after remove_pack_at) — confirm this is intended.
    Node *n = pk->at(0);
    if (n->is_reduction()) {
      _num_reductions++;
    } else {
      _num_work_vecs++;
    }
  }

  // Remove packs that are not profitable; iterate to a fixed point since
  // removing one pack can make a neighboring pack unprofitable.
  bool changed;
  do {
    changed = false;
    for (int i = _packset.length() - 1; i >= 0; i--) {
      Node_List* pk = _packset.at(i);
      bool prof = profitable(pk);
      if (!prof) {
#ifndef PRODUCT
        if ((TraceSuperWord && Verbose) || _vector_loop_debug) {
          tty->print_cr("Unprofitable");
          pk->at(0)->dump();
        }
#endif
        remove_pack_at(i);
        changed = true;
      }
    }
  } while (changed);

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print_cr("\nAfter filter_packs");
    print_packset();
    tty->cr();
  }
#endif
}
1800 
1801 //------------------------------merge_packs_to_cmovd---------------------------
1802 // Merge CMoveD into new vector-nodes
1803 // We want to catch this pattern and subsume CmpD and Bool into CMoveD
1804 //
1805 //                   SubD             ConD
1806 //                  /  |               /
1807 //                 /   |           /   /
1808 //                /    |       /      /
1809 //               /     |   /         /
1810 //              /      /            /
1811 //             /    /  |           /
1812 //            v /      |          /
1813 //         CmpD        |         /
1814 //          |          |        /
1815 //          v          |       /
1816 //         Bool        |      /
1817 //           \         |     /
1818 //             \       |    /
1819 //               \     |   /
1820 //                 \   |  /
1821 //                   \ v /
1822 //                   CMoveD
1823 //
1824 
merge_packs_to_cmovd()1825 void SuperWord::merge_packs_to_cmovd() {
1826   for (int i = _packset.length() - 1; i >= 0; i--) {
1827     _cmovev_kit.make_cmovevd_pack(_packset.at(i));
1828   }
1829   #ifndef PRODUCT
1830     if (TraceSuperWord) {
1831       tty->print_cr("\nSuperWord::merge_packs_to_cmovd(): After merge");
1832       print_packset();
1833       tty->cr();
1834     }
1835   #endif
1836 }
1837 
is_Bool_candidate(Node * def) const1838 Node* CMoveKit::is_Bool_candidate(Node* def) const {
1839   Node* use = NULL;
1840   if (!def->is_Bool() || def->in(0) != NULL || def->outcnt() != 1) {
1841     return NULL;
1842   }
1843   for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1844     use = def->fast_out(j);
1845     if (!_sw->same_generation(def, use) || !use->is_CMove()) {
1846       return NULL;
1847     }
1848   }
1849   return use;
1850 }
1851 
is_CmpD_candidate(Node * def) const1852 Node* CMoveKit::is_CmpD_candidate(Node* def) const {
1853   Node* use = NULL;
1854   if (!def->is_Cmp() || def->in(0) != NULL || def->outcnt() != 1) {
1855     return NULL;
1856   }
1857   for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1858     use = def->fast_out(j);
1859     if (!_sw->same_generation(def, use) || (use = is_Bool_candidate(use)) == NULL || !_sw->same_generation(def, use)) {
1860       return NULL;
1861     }
1862   }
1863   return use;
1864 }
1865 
Node_List* CMoveKit::make_cmovevd_pack(Node_List* cmovd_pk) {
  // Try to merge the Bool and Cmp packs feeding a CMoveF/CMoveD pack into one
  // combined pack (see the diagram above), so the whole pattern can later be
  // emitted as a vector blend. Returns the new combined pack, or NULL if the
  // pattern does not qualify. On success, mutates _sw->_packset: the three
  // original packs are removed and the combined pack is appended.
  Node *cmovd = cmovd_pk->at(0);
  if (!cmovd->is_CMove()) {
    return NULL;
  }
  if (cmovd->Opcode() != Op_CMoveF && cmovd->Opcode() != Op_CMoveD) {
    return NULL;
  }
  if (pack(cmovd) != NULL) { // already in the cmov pack
    return NULL;
  }
  if (cmovd->in(0) != NULL) {
    NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CMoveD %d has control flow, escaping...", cmovd->_idx); cmovd->dump();})
    return NULL;
  }

  // The condition Bool must be exclusive to this CMove, control-free, packed,
  // and from the same unroll generation.
  Node* bol = cmovd->as_CMove()->in(CMoveNode::Condition);
  if (!bol->is_Bool()
      || bol->outcnt() != 1
      || !_sw->same_generation(bol, cmovd)
      || bol->in(0) != NULL  // BoolNode has control flow!!
      || _sw->my_pack(bol) == NULL) {
      NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: Bool %d does not fit CMoveD %d for building vector, escaping...", bol->_idx, cmovd->_idx); bol->dump();})
      return NULL;
  }
  Node_List* bool_pk = _sw->my_pack(bol);
  if (bool_pk->size() != cmovd_pk->size() ) {
    return NULL;
  }

  // Same constraints for the Cmp feeding the Bool.
  Node* cmpd = bol->in(1);
  if (!cmpd->is_Cmp()
      || cmpd->outcnt() != 1
      || !_sw->same_generation(cmpd, cmovd)
      || cmpd->in(0) != NULL  // CmpDNode has control flow!!
      || _sw->my_pack(cmpd) == NULL) {
      NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CmpD %d does not fit CMoveD %d for building vector, escaping...", cmpd->_idx, cmovd->_idx); cmpd->dump();})
      return NULL;
  }
  Node_List* cmpd_pk = _sw->my_pack(cmpd);
  if (cmpd_pk->size() != cmovd_pk->size() ) {
    return NULL;
  }

  // Check element-wise compatibility of the Cmp pack with the CMove pack.
  if (!test_cmpd_pack(cmpd_pk, cmovd_pk)) {
    NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: cmpd pack for CmpD %d failed vectorization test", cmpd->_idx); cmpd->dump();})
    return NULL;
  }

  // Build the combined pack: it lists only the CMoves, but all three nodes of
  // each lane are mapped to it in the CMoveKit map. Note the loop-local 'bol'
  // intentionally shadows the outer one (same for lane 0).
  Node_List* new_cmpd_pk = new Node_List();
  uint sz = cmovd_pk->size() - 1;
  for (uint i = 0; i <= sz; ++i) {
    Node* cmov = cmovd_pk->at(i);
    Node* bol  = bool_pk->at(i);
    Node* cmp  = cmpd_pk->at(i);

    new_cmpd_pk->insert(i, cmov);

    map(cmov, new_cmpd_pk);
    map(bol, new_cmpd_pk);
    map(cmp, new_cmpd_pk);

    _sw->set_my_pack(cmov, new_cmpd_pk); // and keep old packs for cmp and bool
  }
  _sw->_packset.remove(cmovd_pk);
  _sw->_packset.remove(bool_pk);
  _sw->_packset.remove(cmpd_pk);
  _sw->_packset.append(new_cmpd_pk);
  NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print_cr("CMoveKit::make_cmovevd_pack: added syntactic CMoveD pack"); _sw->print_pack(new_cmpd_pk);})
  return new_cmpd_pk;
}
1937 
bool CMoveKit::test_cmpd_pack(Node_List* cmpd_pk, Node_List* cmovd_pk) {
  // Check that the Cmp pack can be subsumed into the vector built for the
  // CMove pack: the Cmp inputs must be either packed lane-for-lane or the
  // same scalar for every lane, and they must line up (possibly swapped) with
  // the CMove's IfFalse/IfTrue inputs in every lane.
  Node* cmpd0 = cmpd_pk->at(0);
  assert(cmpd0->is_Cmp(), "CMoveKit::test_cmpd_pack: should be CmpDNode");
  assert(cmovd_pk->at(0)->is_CMove(), "CMoveKit::test_cmpd_pack: should be CMoveD");
  assert(cmpd_pk->size() == cmovd_pk->size(), "CMoveKit::test_cmpd_pack: should be same size");
  Node* in1 = cmpd0->in(1);
  Node* in2 = cmpd0->in(2);
  Node_List* in1_pk = _sw->my_pack(in1);
  Node_List* in2_pk = _sw->my_pack(in2);

  // Packed inputs must have the same lane count as the Cmp pack.
  if (  (in1_pk != NULL && in1_pk->size() != cmpd_pk->size())
     || (in2_pk != NULL && in2_pk->size() != cmpd_pk->size()) ) {
    return false;
  }

  // test if "all" in1 are in the same pack or the same node
  if (in1_pk == NULL) {
    for (uint j = 1; j < cmpd_pk->size(); j++) {
      if (cmpd_pk->at(j)->in(1) != in1) {
        return false;
      }
    }//for: in1_pk is not pack but all CmpD nodes in the pack have the same in(1)
  }
  // test if "all" in2 are in the same pack or the same node
  if (in2_pk == NULL) {
    for (uint j = 1; j < cmpd_pk->size(); j++) {
      if (cmpd_pk->at(j)->in(2) != in2) {
        return false;
      }
    }//for: in2_pk is not pack but all CmpD nodes in the pack have the same in(2)
  }
  //now check if cmpd_pk may be subsumed in vector built for cmovd_pk
  // Lane 0 fixes the orientation (which Cmp input maps to IfFalse/IfTrue);
  // every other lane must use the same orientation.
  int cmovd_ind1, cmovd_ind2;
  if (cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
   && cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
      cmovd_ind1 = CMoveNode::IfFalse;
      cmovd_ind2 = CMoveNode::IfTrue;
  } else if (cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
          && cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
      cmovd_ind2 = CMoveNode::IfFalse;
      cmovd_ind1 = CMoveNode::IfTrue;
  }
  else {
    return false;
  }

  for (uint j = 1; j < cmpd_pk->size(); j++) {
    if (cmpd_pk->at(j)->in(1) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind1)
        || cmpd_pk->at(j)->in(2) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind2)) {
        return false;
    }//if
  }
  NOT_PRODUCT(if(_sw->is_trace_cmov()) { tty->print("CMoveKit::test_cmpd_pack: cmpd pack for 1st CmpD %d is OK for vectorization: ", cmpd0->_idx); cmpd0->dump(); })
  return true;
}
1993 
1994 //------------------------------implemented---------------------------
1995 // Can code be generated for pack p?
implemented(Node_List * p)1996 bool SuperWord::implemented(Node_List* p) {
1997   bool retValue = false;
1998   Node* p0 = p->at(0);
1999   if (p0 != NULL) {
2000     int opc = p0->Opcode();
2001     uint size = p->size();
2002     if (p0->is_reduction()) {
2003       const Type *arith_type = p0->bottom_type();
2004       // Length 2 reductions of INT/LONG do not offer performance benefits
2005       if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
2006         retValue = false;
2007       } else {
2008         retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
2009       }
2010     } else {
2011       retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
2012     }
2013     if (!retValue) {
2014       if (is_cmov_pack(p)) {
2015         NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmpd pack"); print_pack(p);})
2016         return true;
2017       }
2018     }
2019   }
2020   return retValue;
2021 }
2022 
is_cmov_pack(Node_List * p)2023 bool SuperWord::is_cmov_pack(Node_List* p) {
2024   return _cmovev_kit.pack(p->at(0)) != NULL;
2025 }
2026 //------------------------------same_inputs--------------------------
2027 // For pack p, are all idx operands the same?
same_inputs(Node_List * p,int idx)2028 bool SuperWord::same_inputs(Node_List* p, int idx) {
2029   Node* p0 = p->at(0);
2030   uint vlen = p->size();
2031   Node* p0_def = p0->in(idx);
2032   for (uint i = 1; i < vlen; i++) {
2033     Node* pi = p->at(i);
2034     Node* pi_def = pi->in(idx);
2035     if (p0_def != pi_def) {
2036       return false;
2037     }
2038   }
2039   return true;
2040 }
2041 
2042 //------------------------------profitable---------------------------
2043 // For pack p, are all operands and all uses (with in the block) vector?
bool SuperWord::profitable(Node_List* p) {
  // For pack p, are all operands and all uses (within the block) vector?
  // May clear the reduction flag on p's first node as a side effect (below).
  Node* p0 = p->at(0);
  uint start, end;
  VectorNode::vector_operands(p0, &start, &end);

  // Return false if some inputs are not vectors or vectors with different
  // size or alignment.
  // Also, for now, return false if not scalar promotion case when inputs are
  // the same. Later, implement PackNode and allow differing, non-vector inputs
  // (maybe just the ones from outside the block.)
  for (uint i = start; i < end; i++) {
    if (!is_vector_use(p0, i)) {
      return false;
    }
  }
  // Check if reductions are connected
  if (p0->is_reduction()) {
    Node* second_in = p0->in(2);
    Node_List* second_pk = my_pack(second_in);
    if ((second_pk == NULL) || (_num_work_vecs == _num_reductions)) {
      // Remove reduction flag if no parent pack or if not enough work
      // to cover reduction expansion overhead
      p0->remove_flag(Node::Flag_is_reduction);
      return false;
    } else if (second_pk->size() != p->size()) {
      // The feeding reduction pack must match this pack lane-for-lane.
      return false;
    }
  }
  if (VectorNode::is_shift(p0)) {
    // For now, return false if shift count is vector or not scalar promotion
    // case (different shift counts) because it is not supported yet.
    Node* cnt = p0->in(2);
    Node_List* cnt_pk = my_pack(cnt);
    if (cnt_pk != NULL)
      return false;
    if (!same_inputs(p, 2))
      return false;
  }
  if (!p0->is_Store()) {
    // For now, return false if not all uses are vector.
    // Later, implement ExtractNode and allow non-vector uses (maybe
    // just the ones outside the block.)
    for (uint i = 0; i < p->size(); i++) {
      Node* def = p->at(i);
      if (is_cmov_pack_internal_node(p, def)) {
        // Bool/Cmp nodes subsumed into a CMove pack need no scalar-use check.
        continue;
      }
      for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
        Node* use = def->fast_out(j);
        for (uint k = 0; k < use->req(); k++) {
          Node* n = use->in(k);
          if (def == n) {
            // Reductions should only have a Phi use at the loop head or a non-phi use
            // outside of the loop if it is the last element of the pack (e.g. SafePoint).
            if (def->is_reduction() &&
                ((use->is_Phi() && use->in(0) == _lpt->_head) ||
                 (!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) {
              continue;
            }
            if (!is_vector_use(use, k)) {
              return false;
            }
          }
        }
      }
    }
  }
  return true;
}
2113 
2114 //------------------------------schedule---------------------------
2115 // Adjust the memory graph for the packed operations
schedule()2116 void SuperWord::schedule() {
2117 
2118   // Co-locate in the memory graph the members of each memory pack
2119   for (int i = 0; i < _packset.length(); i++) {
2120     co_locate_pack(_packset.at(i));
2121   }
2122 }
2123 
2124 //-------------------------------remove_and_insert-------------------
2125 // Remove "current" from its current position in the memory graph and insert
2126 // it after the appropriate insertion point (lip or uip).
void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
                                  Node *uip, Unique_Node_List &sched_before) {
  // current's own memory input, and whether current is being hoisted above
  // the pack (member of sched_before) or sunk below it.
  Node* my_mem = current->in(MemNode::Memory);
  bool sched_up = sched_before.member(current);

  // remove current_store from its current position in the memory graph:
  // retarget each memory user of current depending on where that user is
  // itself being scheduled.
  for (DUIterator i = current->outs(); current->has_out(i); i++) {
    Node* use = current->out(i);
    if (use->is_Mem()) {
      assert(use->in(MemNode::Memory) == current, "must be");
      if (use == prev) { // connect prev to my_mem
          _igvn.replace_input_of(use, MemNode::Memory, my_mem);
          --i; //deleted this edge; rescan position
      } else if (sched_before.member(use)) {
        if (!sched_up) { // Will be moved together with current
          _igvn.replace_input_of(use, MemNode::Memory, uip);
          --i; //deleted this edge; rescan position
        }
      } else {
        if (sched_up) { // Will be moved together with current
          _igvn.replace_input_of(use, MemNode::Memory, lip);
          --i; //deleted this edge; rescan position
        }
      }
    }
  }

  // Insertion point: upper (uip) when hoisting, lower (lip) when sinking.
  Node *insert_pt =  sched_up ?  uip : lip;

  // all uses of insert_pt's memory state should use current's instead
  for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
    Node* use = insert_pt->out(i);
    if (use->is_Mem()) {
      assert(use->in(MemNode::Memory) == insert_pt, "must be");
      _igvn.replace_input_of(use, MemNode::Memory, current);
      --i; //deleted this edge; rescan position
    } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
      uint pos; //lip (lower insert point) must be the last one in the memory slice
      for (pos=1; pos < use->req(); pos++) {
        if (use->in(pos) == insert_pt) break;
      }
      _igvn.replace_input_of(use, pos, current);
      --i;
    }
  }

  //connect current to insert_pt
  _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
}
2176 
2177 //------------------------------co_locate_pack----------------------------------
2178 // To schedule a store pack, we need to move any sandwiched memory ops either before
2179 // or after the pack, based upon dependence information:
2180 // (1) If any store in the pack depends on the sandwiched memory op, the
2181 //     sandwiched memory op must be scheduled BEFORE the pack;
2182 // (2) If a sandwiched memory op depends on any store in the pack, the
2183 //     sandwiched memory op must be scheduled AFTER the pack;
2184 // (3) If a sandwiched memory op (say, memA) depends on another sandwiched
2185 //     memory op (say memB), memB must be scheduled before memA. So, if memA is
2186 //     scheduled before the pack, memB must also be scheduled before the pack;
2187 // (4) If there is no dependence restriction for a sandwiched memory op, we simply
2188 //     schedule this store AFTER the pack
2189 // (5) We know there is no dependence cycle, so there in no other case;
2190 // (6) Finally, all memory ops in another single pack should be moved in the same direction.
2191 //
2192 // To schedule a load pack, we use the memory state of either the first or the last load in
2193 // the pack, based on the dependence constraint.
void SuperWord::co_locate_pack(Node_List* pk) {
  if (pk->at(0)->is_Store()) {
    // Store pack: move sandwiched memory ops above or below the pack per the
    // rules (1)-(6) in the header comment, then chain the pack's stores.
    MemNode* first     = executed_first(pk)->as_Mem();
    MemNode* last      = executed_last(pk)->as_Mem();
    Unique_Node_List schedule_before_pack;
    Unique_Node_List memops;

    // Collect every memory op between "last" and "first" (walking the memory
    // input chain upwards) plus their direct memory users.
    MemNode* current   = last->in(MemNode::Memory)->as_Mem();
    MemNode* previous  = last;
    while (true) {
      assert(in_bb(current), "stay in block");
      memops.push(previous);
      for (DUIterator i = current->outs(); current->has_out(i); i++) {
        Node* use = current->out(i);
        if (use->is_Mem() && use != previous)
          memops.push(use);
      }
      if (current == first) break;
      previous = current;
      current  = current->in(MemNode::Memory)->as_Mem();
    }

    // determine which memory operations should be scheduled before the pack
    // (rules (1)/(3): anything a pack member or an already-hoisted op depends
    // on goes up, and per rule (6) so does the rest of its own pack).
    for (uint i = 1; i < memops.size(); i++) {
      Node *s1 = memops.at(i);
      if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
        for (uint j = 0; j< i; j++) {
          Node *s2 = memops.at(j);
          if (!independent(s1, s2)) {
            if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
              schedule_before_pack.push(s1); // s1 must be scheduled before
              Node_List* mem_pk = my_pack(s1);
              if (mem_pk != NULL) {
                for (uint ii = 0; ii < mem_pk->size(); ii++) {
                  Node* s = mem_pk->at(ii);  // follow partner
                  if (memops.member(s) && !schedule_before_pack.member(s))
                    schedule_before_pack.push(s);
                }
              }
              break;
            }
          }
        }
      }
    }

    Node*    upper_insert_pt = first->in(MemNode::Memory);
    // Following code moves loads connected to upper_insert_pt below aliased stores.
    // Collect such loads here and reconnect them back to upper_insert_pt later.
    memops.clear();
    for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
      Node* use = upper_insert_pt->out(i);
      if (use->is_Mem() && !use->is_Store()) {
        memops.push(use);
      }
    }

    MemNode* lower_insert_pt = last;
    previous                 = last; //previous store in pk
    current                  = last->in(MemNode::Memory)->as_Mem();

    // start scheduling from "last" to "first"
    while (true) {
      assert(in_bb(current), "stay in block");
      assert(in_pack(previous, pk), "previous stays in pack");
      Node* my_mem = current->in(MemNode::Memory);

      if (in_pack(current, pk)) {
        // Forward users of my memory state (except "previous) to my input memory state
        for (DUIterator i = current->outs(); current->has_out(i); i++) {
          Node* use = current->out(i);
          if (use->is_Mem() && use != previous) {
            assert(use->in(MemNode::Memory) == current, "must be");
            if (schedule_before_pack.member(use)) {
              _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
            } else {
              _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
            }
            --i; // deleted this edge; rescan position
          }
        }
        previous = current;
      } else { // !in_pack(current, pk) ==> a sandwiched store
        remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
      }

      if (current == first) break;
      current = my_mem->as_Mem();
    } // end while

    // Reconnect loads back to upper_insert_pt.
    for (uint i = 0; i < memops.size(); i++) {
      Node *ld = memops.at(i);
      if (ld->in(MemNode::Memory) != upper_insert_pt) {
        _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
      }
    }
  } else if (pk->at(0)->is_Load()) { // Load pack
    // All loads in the pack should have the same memory state. By default,
    // we use the memory state of the last load. However, if any load could
    // not be moved down due to the dependence constraint, we use the memory
    // state of the first load.
    Node* mem_input = pick_mem_state(pk);
    _igvn.hash_delete(mem_input);
    // Give each load the same memory state
    for (uint i = 0; i < pk->size(); i++) {
      LoadNode* ld = pk->at(i)->as_Load();
      _igvn.replace_input_of(ld, MemNode::Memory, mem_input);
    }
  }
}
2305 
2306 // Finds the first and last memory state and then picks either of them by checking dependence constraints.
2307 // If a store is dependent on an earlier load then we need to pick the memory state of the first load and cannot
2308 // pick the memory state of the last load.
Node* SuperWord::pick_mem_state(Node_List* pk) {
  // Finds the first and last memory state of the load pack and picks one of
  // them based on dependence constraints (see comment above this function).
  Node* first_mem = find_first_mem_state(pk);
  Node* last_mem  = find_last_mem_state(pk, first_mem);

  // If moving a load down to last_mem would cross a store it is not
  // independent of, we must use the first memory state instead.
  for (uint i = 0; i < pk->size(); i++) {
    Node* ld = pk->at(i);
    for (Node* current = last_mem; current != ld->in(MemNode::Memory); current = current->in(MemNode::Memory)) {
      assert(current->is_Mem() && in_bb(current), "unexpected memory");
      assert(current != first_mem, "corrupted memory graph");
      if (!independent(current, ld)) {
#ifdef ASSERT
        // Added assertion code since no case has been observed that should pick the first memory state.
        // Remove the assertion code whenever we find a (valid) case that really needs the first memory state.
        pk->dump();
        first_mem->dump();
        last_mem->dump();
        current->dump();
        ld->dump();
        ld->in(MemNode::Memory)->dump();
        assert(false, "never observed that first memory should be picked");
#endif
        return first_mem; // A later store depends on this load, pick memory state of first load
      }
    }
  }
  return last_mem;
}
2336 
2337 // Walk the memory graph from the current first load until the
2338 // start of the loop and check if nodes on the way are memory
2339 // edges of loads in the pack. The last one we encounter is the
2340 // first load.
Node* SuperWord::find_first_mem_state(Node_List* pk) {
  // Walk the memory graph from the current first load until the start of the
  // loop (i.e. until we leave the block) and check if nodes on the way are
  // memory edges of loads in the pack. The last one we encounter is the
  // first load's memory state.
  Node* first_mem = pk->at(0)->in(MemNode::Memory);
  for (Node* current = first_mem; in_bb(current); current = current->is_Phi() ? current->in(LoopNode::EntryControl) : current->in(MemNode::Memory)) {
    assert(current->is_Mem() || (current->is_Phi() && current->in(0) == bb()), "unexpected memory");
    for (uint i = 1; i < pk->size(); i++) {
      Node* ld = pk->at(i);
      if (ld->in(MemNode::Memory) == current) {
        first_mem = current;
        break;
      }
    }
  }
  return first_mem;
}
2355 
// Find the last load by going over the pack again and walking
// the memory graph from the loads of the pack to the memory of
// the first load. If we encounter the memory of the current last
// load, then we started from further down in the memory graph and
// the load we started from is the last load.
Node* SuperWord::find_last_mem_state(Node_List* pk, Node* first_mem) {
  Node* last_mem = pk->at(0)->in(MemNode::Memory);
  for (uint i = 0; i < pk->size(); i++) {
    Node* ld = pk->at(i);
    // Walk upwards from this load's memory state towards first_mem; if the
    // current candidate last_mem appears strictly above us on the chain,
    // this load's state is further down, so it becomes the new candidate.
    for (Node* current = ld->in(MemNode::Memory); current != first_mem; current = current->in(MemNode::Memory)) {
      assert(current->is_Mem() && in_bb(current), "unexpected memory");
      if (current->in(MemNode::Memory) == last_mem) {
        last_mem = ld->in(MemNode::Memory);
      }
    }
  }
  return last_mem;
}
2374 
2375 #ifndef PRODUCT
print_loop(bool whole)2376 void SuperWord::print_loop(bool whole) {
2377   Node_Stack stack(_arena, _phase->C->unique() >> 2);
2378   Node_List rpo_list;
2379   VectorSet visited(_arena);
2380   visited.set(lpt()->_head->_idx);
2381   _phase->rpo(lpt()->_head, stack, visited, rpo_list);
2382   _phase->dump(lpt(), rpo_list.size(), rpo_list );
2383   if(whole) {
2384     tty->print_cr("\n Whole loop tree");
2385     _phase->dump();
2386     tty->print_cr(" End of whole loop tree\n");
2387   }
2388 }
2389 #endif
2390 
//------------------------------output---------------------------
// Convert packs into vector node operations
// For every pack in the packset, build a single vector node
// (LoadVector/StoreVector/VectorNode/ReductionNode/CMoveV*) when the last
// executed member of the pack is reached while scanning the block, then
// replace all scalar pack members with that node. On any failure the
// reserved loop copy (if taken) is used to roll back.
void SuperWord::output() {
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  Compile* C = _phase->C;
  if (_packset.length() == 0) {
    if (cl->is_main_loop()) {
      // Instigate more unrolling for optimization when vectorization fails.
      C->set_major_progress();
      cl->set_notpassed_slp();
      cl->mark_do_unroll_only();
    }
    return;
  }

#ifndef PRODUCT
  if (TraceLoopOpts) {
    tty->print("SuperWord::output    ");
    lpt()->dump_head();
  }
#endif

  if (cl->is_main_loop()) {
    // MUST ENSURE main loop's initial value is properly aligned:
    //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0

    align_initial_loop_index(align_to_ref());

    // Insert extract (unpack) operations for scalar uses
    for (int i = 0; i < _packset.length(); i++) {
      insert_extracts(_packset.at(i));
    }
  }

  uint max_vlen_in_bytes = 0;
  uint max_vlen = 0;
  // Post loops are handled with predicated (masked) vectors when available.
  bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());

  NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})

  // Keep a reserved copy of the loop so we can revert if vectorization must bail out below.
  CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());

  NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop after create_reserve_version_of_loop"); print_loop(true);})

  if (do_reserve_copy() && !make_reversable.has_reserved()) {
    NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: loop was not reserved correctly, exiting SuperWord");})
    return;
  }

  for (int i = 0; i < _block.length(); i++) {
    Node* n = _block.at(i);
    Node_List* p = my_pack(n);
    // Emit the vector node only once per pack: at its last executed member.
    if (p && n == executed_last(p)) {
      uint vlen = p->size();
      uint vlen_in_bytes = 0;
      Node* vn = NULL;
      Node* low_adr = p->at(0);
      Node* first   = executed_first(p);
      if (can_process_post_loop) {
        // override vlen with the main loops vector length
        vlen = cl->slp_max_unroll();
      }
      NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
      int   opc = n->Opcode();
      if (n->is_Load()) {
        Node* ctl = n->in(MemNode::Control);
        Node* mem = first->in(MemNode::Memory);
        SWPointer p1(n->as_Mem(), this, NULL, false);
        // Identify the memory dependency for the new loadVector node by
        // walking up through memory chain.
        // This is done to give flexibility to the new loadVector node so that
        // it can move above independent storeVector nodes.
        while (mem->is_StoreVector()) {
          SWPointer p2(mem->as_Mem(), this, NULL, false);
          int cmp = p1.cmp(p2);
          if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
            mem = mem->in(MemNode::Memory);
          } else {
            break; // dependent memory
          }
        }
        Node* adr = low_adr->in(MemNode::Address);
        const TypePtr* atyp = n->adr_type();
        vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
        vlen_in_bytes = vn->as_LoadVector()->memory_size();
      } else if (n->is_Store()) {
        // Promote value to be stored to vector
        Node* val = vector_opd(p, MemNode::ValueIn);
        if (val == NULL) {
          if (do_reserve_copy()) {
            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: val should not be NULL, exiting SuperWord");})
            return; //and reverse to backup IG
          }
          ShouldNotReachHere();
        }

        Node* ctl = n->in(MemNode::Control);
        Node* mem = first->in(MemNode::Memory);
        Node* adr = low_adr->in(MemNode::Address);
        const TypePtr* atyp = n->adr_type();
        vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
        vlen_in_bytes = vn->as_StoreVector()->memory_size();
      } else if (VectorNode::is_scalar_rotate(n)) {
        Node* in1 = low_adr->in(1);
        Node* in2 = p->at(0)->in(2);
        assert(in2->bottom_type()->isa_int(), "Shift must always be an int value");
        // If rotation count is non-constant or greater than 8bit value create a vector.
        if (!in2->is_Con() || -0x80 > in2->get_int() || in2->get_int() >= 0x80) {
          in2 =  vector_opd(p, 2);
        }
        vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      } else if (VectorNode::is_roundopD(n)) {
        Node* in1 = vector_opd(p, 1);
        Node* in2 = low_adr->in(2);
        assert(in2->is_Con(), "Constant rounding mode expected.");
        vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      } else if (VectorNode::is_muladds2i(n)) {
        assert(n->req() == 5u, "MulAddS2I should have 4 operands.");
        Node* in1 = vector_opd(p, 1);
        Node* in2 = vector_opd(p, 2);
        vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      } else if (n->req() == 3 && !is_cmov_pack(p)) {
        // Generic binary operation (two data inputs plus control slot).
        // Promote operands to vector
        Node* in1 = NULL;
        bool node_isa_reduction = n->is_reduction();
        if (node_isa_reduction) {
          // the input to the first reduction operation is retained
          in1 = low_adr->in(1);
        } else {
          in1 = vector_opd(p, 1);
          if (in1 == NULL) {
            if (do_reserve_copy()) {
              NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: in1 should not be NULL, exiting SuperWord");})
              return; //and reverse to backup IG
            }
            ShouldNotReachHere();
          }
        }
        Node* in2 = vector_opd(p, 2);
        if (in2 == NULL) {
          if (do_reserve_copy()) {
            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: in2 should not be NULL, exiting SuperWord");})
            return; //and reverse to backup IG
          }
          ShouldNotReachHere();
        }
        if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
          // Move invariant vector input into second position to avoid register spilling.
          Node* tmp = in1;
          in1 = in2;
          in2 = tmp;
        }
        if (node_isa_reduction) {
          const Type *arith_type = n->bottom_type();
          vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
          if (in2->is_Load()) {
            vlen_in_bytes = in2->as_LoadVector()->memory_size();
          } else {
            vlen_in_bytes = in2->as_Vector()->length_in_bytes();
          }
        } else {
          vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
          vlen_in_bytes = vn->as_Vector()->length_in_bytes();
        }
      } else if (opc == Op_SqrtF || opc == Op_SqrtD ||
                 opc == Op_AbsF || opc == Op_AbsD ||
                 opc == Op_AbsI || opc == Op_AbsL ||
                 opc == Op_NegF || opc == Op_NegD ||
                 opc == Op_PopCountI) {
        // Unary operations.
        assert(n->req() == 2, "only one input expected");
        Node* in = vector_opd(p, 1);
        vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      } else if (is_cmov_pack(p)) {
        if (can_process_post_loop) {
          // do not refactor of flow in post loop context
          return;
        }
        if (!n->is_CMove()) {
          continue;
        }
        // place here CMoveVDNode
        NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
        Node* bol = n->in(CMoveNode::Condition);
        if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > 1 ) {
          NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d is not Bool node, trying its in(1) node %d", bol->_idx, bol->in(1)->_idx); bol->dump(); bol->in(1)->dump();})
          bol = bol->in(1); //may be ExtractNode
        }

        assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
        if (!bol->is_Bool()) {
          if (do_reserve_copy()) {
            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: expected %d bool node, exiting SuperWord", bol->_idx); bol->dump();})
            return; //and reverse to backup IG
          }
          ShouldNotReachHere();
        }

        // Clone the Bool with the test condition encoded as an int constant
        // input, as expected by the vector CMove nodes.
        int cond = (int)bol->as_Bool()->_test._test;
        Node* in_cc  = _igvn.intcon(cond);
        NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created intcon in_cc node %d", in_cc->_idx); in_cc->dump();})
        Node* cc = bol->clone();
        cc->set_req(1, in_cc);
        NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created bool cc node %d", cc->_idx); cc->dump();})

        Node* src1 = vector_opd(p, 2); //2=CMoveNode::IfFalse
        if (src1 == NULL) {
          if (do_reserve_copy()) {
            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src1 should not be NULL, exiting SuperWord");})
            return; //and reverse to backup IG
          }
          ShouldNotReachHere();
        }
        Node* src2 = vector_opd(p, 3); //3=CMoveNode::IfTrue
        if (src2 == NULL) {
          if (do_reserve_copy()) {
            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src2 should not be NULL, exiting SuperWord");})
            return; //and reverse to backup IG
          }
          ShouldNotReachHere();
        }
        BasicType bt = velt_basic_type(n);
        const TypeVect* vt = TypeVect::make(bt, vlen);
        assert(bt == T_FLOAT || bt == T_DOUBLE, "Only vectorization for FP cmovs is supported");
        if (bt == T_FLOAT) {
          vn = new CMoveVFNode(cc, src1, src2, vt);
        } else {
          assert(bt == T_DOUBLE, "Expected double");
          vn = new CMoveVDNode(cc, src1, src2, vt);
        }
        NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
      } else if (opc == Op_FmaD || opc == Op_FmaF) {
        // Promote operands to vector
        Node* in1 = vector_opd(p, 1);
        Node* in2 = vector_opd(p, 2);
        Node* in3 = vector_opd(p, 3);
        vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      } else {
        if (do_reserve_copy()) {
          NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
          return; //and reverse to backup IG
        }
        ShouldNotReachHere();
      }

      assert(vn != NULL, "sanity");
      if (vn == NULL) {
        if (do_reserve_copy()){
          NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: got NULL node, cannot proceed, exiting SuperWord");})
          return; //and reverse to backup IG
        }
        ShouldNotReachHere();
      }

      // Install the vector node: register with IGVN, set its control, and
      // replace every scalar member of the pack with it.
      _block.at_put(i, vn);
      _igvn.register_new_node_with_optimizer(vn);
      _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
      for (uint j = 0; j < p->size(); j++) {
        Node* pm = p->at(j);
        _igvn.replace_node(pm, vn);
      }
      _igvn._worklist.push(vn);

      if (can_process_post_loop) {
        // first check if the vector size if the maximum vector which we can use on the machine,
        // other vector size have reduced values for predicated data mapping.
        if (vlen_in_bytes != (uint)MaxVectorSize) {
          return;
        }
      }

      if (vlen > max_vlen) {
        max_vlen = vlen;
      }
      if (vlen_in_bytes > max_vlen_in_bytes) {
        max_vlen_in_bytes = vlen_in_bytes;
      }
#ifdef ASSERT
      if (TraceNewVectors) {
        tty->print("new Vector node: ");
        vn->dump();
      }
#endif
    }
  }//for (int i = 0; i < _block.length(); i++)

  if (max_vlen_in_bytes > C->max_vector_size()) {
    C->set_max_vector_size(max_vlen_in_bytes);
  }
  if (max_vlen_in_bytes > 0) {
    cl->mark_loop_vectorized();
  }

  if (SuperWordLoopUnrollAnalysis) {
    if (cl->has_passed_slp()) {
      uint slp_max_unroll_factor = cl->slp_max_unroll();
      if (slp_max_unroll_factor == max_vlen) {
        if (TraceSuperWordLoopUnrollAnalysis) {
          tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
        }

        // For atomic unrolled loops which are vector mapped, instigate more unrolling
        cl->set_notpassed_slp();
        if (cl->is_main_loop()) {
          // if vector resources are limited, do not allow additional unrolling, also
          // do not unroll more on pure vector loops which were not reduced so that we can
          // program the post loop to single iteration execution.
          if (FLOATPRESSURE > 8) {
            C->set_major_progress();
            cl->mark_do_unroll_only();
          }
        }

        if (do_reserve_copy()) {
          if (can_process_post_loop) {
            // Now create the difference of trip and limit and use it as our mask index.
            // Note: We limited the unroll of the vectorized loop so that
            //       only vlen-1 size iterations can remain to be mask programmed.
            Node *incr = cl->incr();
            SubINode *index = new SubINode(cl->limit(), cl->init_trip());
            _igvn.register_new_node_with_optimizer(index);
            SetVectMaskINode  *mask = new SetVectMaskINode(_phase->get_ctrl(cl->init_trip()), index);
            _igvn.register_new_node_with_optimizer(mask);
            // make this a single iteration loop
            AddINode *new_incr = new AddINode(incr->in(1), mask);
            _igvn.register_new_node_with_optimizer(new_incr);
            _phase->set_ctrl(new_incr, _phase->get_ctrl(incr));
            _igvn.replace_node(incr, new_incr);
            cl->mark_is_multiversioned();
            cl->loopexit()->add_flag(Node::Flag_has_vector_mask_set);
          }
        }
      }
    }
  }

  // Success: commit to the vectorized loop and drop the reserved copy.
  if (do_reserve_copy()) {
    make_reversable.use_new();
  }
  NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
  return;
}
2737 
//------------------------------vector_opd---------------------------
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
// Returns an existing vector, a scalar-promoted (replicated) vector, a
// shift-count node, or a newly built Pack node; returns NULL on bailout.
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
  Node* p0 = p->at(0);
  uint vlen = p->size();
  Node* opd = p0->in(opd_idx);
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();

  if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) {
    // override vlen with the main loops vector length
    vlen = cl->slp_max_unroll();
  }

  // All pack members share the same input for this operand index: either it
  // is already a vector, a shift count, or a scalar that must be replicated.
  if (same_inputs(p, opd_idx)) {
    if (opd->is_Vector() || opd->is_LoadVector()) {
      assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
      if (opd_idx == 2 && VectorNode::is_shift(p0)) {
        NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("shift's count can't be vector");})
        return NULL;
      }
      return opd; // input is matching vector
    }
    if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
      Compile* C = _phase->C;
      Node* cnt = opd;
      // Vector instructions do not mask shift count, do it here.
      juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
      const TypeInt* t = opd->find_int_type();
      if (t != NULL && t->is_con()) {
        juint shift = t->get_con();
        if (shift > mask) { // Unsigned cmp
          cnt = ConNode::make(TypeInt::make(shift & mask));
        }
      } else {
        // Non-constant count: AND it with the mask unless its type already
        // proves it is in range.
        if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
          cnt = ConNode::make(TypeInt::make(mask));
          _igvn.register_new_node_with_optimizer(cnt);
          cnt = new AndINode(opd, cnt);
          _igvn.register_new_node_with_optimizer(cnt);
          _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
        }
        assert(opd->bottom_type()->isa_int(), "int type only");
        if (!opd->bottom_type()->isa_int()) {
          NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should be int type only");})
          return NULL;
        }
      }
      // Move shift count into vector register.
      cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
      _igvn.register_new_node_with_optimizer(cnt);
      _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
      return cnt;
    }
    assert(!opd->is_StoreVector(), "such vector is not expected here");
    if (opd->is_StoreVector()) {
      NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("StoreVector is not expected here");})
      return NULL;
    }
    // Convert scalar input to vector with the same number of elements as
    // p0's vector. Use p0's type because size of operand's container in
    // vector should match p0's size regardless operand's size.
    const Type* p0_t = NULL;
    VectorNode* vn = NULL;
    if (opd_idx == 2 && VectorNode::is_scalar_rotate(p0)) {
       // Rotate count is always int; widen it to long first when the
       // rotated value is long so element sizes match.
       Node* conv = opd;
       p0_t =  TypeInt::INT;
       if (p0->bottom_type()->isa_long()) {
         p0_t = TypeLong::LONG;
         conv = new ConvI2LNode(opd);
         _igvn.register_new_node_with_optimizer(conv);
         _phase->set_ctrl(conv, _phase->get_ctrl(opd));
       }
       vn = VectorNode::scalar2vector(conv, vlen, p0_t);
    } else {
       p0_t =  velt_type(p0);
       vn = VectorNode::scalar2vector(opd, vlen, p0_t);
    }

    _igvn.register_new_node_with_optimizer(vn);
    _phase->set_ctrl(vn, _phase->get_ctrl(opd));
#ifdef ASSERT
    if (TraceNewVectors) {
      tty->print("new Vector node: ");
      vn->dump();
    }
#endif
    return vn;
  }

  // Inputs differ per pack member: gather them into a Pack node.
  // Insert pack operation
  BasicType bt = velt_basic_type(p0);
  PackNode* pk = PackNode::make(opd, vlen, bt);
  DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )

  for (uint i = 1; i < vlen; i++) {
    Node* pi = p->at(i);
    Node* in = pi->in(opd_idx);
    assert(my_pack(in) == NULL, "Should already have been unpacked");
    if (my_pack(in) != NULL) {
      NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should already have been unpacked");})
      return NULL;
    }
    assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
    pk->add_opd(in);
    if (VectorNode::is_muladds2i(pi)) {
      // MulAddS2I carries two operand pairs; pack the second one as well.
      Node* in2 = pi->in(opd_idx + 2);
      assert(my_pack(in2) == NULL, "Should already have been unpacked");
      if (my_pack(in2) != NULL) {
        NOT_PRODUCT(if (is_trace_loop_reverse() || TraceLoopOpts) { tty->print_cr("Should already have been unpacked"); })
          return NULL;
      }
      assert(opd_bt == in2->bottom_type()->basic_type(), "all same type");
      pk->add_opd(in2);
    }
  }
  _igvn.register_new_node_with_optimizer(pk);
  _phase->set_ctrl(pk, _phase->get_ctrl(opd));
#ifdef ASSERT
  if (TraceNewVectors) {
    tty->print("new Vector node: ");
    pk->dump();
  }
#endif
  return pk;
}
2863 
//------------------------------insert_extracts---------------------------
// If a use of pack p is not a vector use, then replace the
// use with an extract operation.
void SuperWord::insert_extracts(Node_List* p) {
  if (p->at(0)->is_Store()) return;
  assert(_n_idx_list.is_empty(), "empty (node,index) list");

  // Inspect each use of each pack member.  For each use that is
  // not a vector use, replace the use with an extract operation.

  // Phase 1: collect (use, input-index) pairs that need an extract. Done
  // separately so we do not modify the graph while iterating over outputs.
  for (uint i = 0; i < p->size(); i++) {
    Node* def = p->at(i);
    for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
      Node* use = def->fast_out(j);
      for (uint k = 0; k < use->req(); k++) {
        Node* n = use->in(k);
        if (def == n) {
          Node_List* u_pk = my_pack(use);
          if ((u_pk == NULL || !is_cmov_pack(u_pk) || use->is_CMove()) && !is_vector_use(use, k)) {
              _n_idx_list.push(use, k);
          }
        }
      }
    }
  }

  // Phase 2: for each collected pair, create an Extract of the def's lane
  // and rewire the scalar use to it.
  while (_n_idx_list.is_nonempty()) {
    Node* use = _n_idx_list.node();
    int   idx = _n_idx_list.index();
    _n_idx_list.pop();
    Node* def = use->in(idx);

    if (def->is_reduction()) continue;

    // Insert extract operation
    _igvn.hash_delete(def);
    // Lane position of def within its vector, derived from its alignment.
    int def_pos = alignment(def) / data_size(def);

    Node* ex = ExtractNode::make(def, def_pos, velt_basic_type(def));
    _igvn.register_new_node_with_optimizer(ex);
    _phase->set_ctrl(ex, _phase->get_ctrl(def));
    _igvn.replace_input_of(use, idx, ex);
    _igvn._worklist.push(def);

    bb_insert_after(ex, bb_idx(def));
    set_velt_type(ex, velt_type(def));
  }
}
2912 
2913 //------------------------------is_vector_use---------------------------
2914 // Is use->in(u_idx) a vector use?
is_vector_use(Node * use,int u_idx)2915 bool SuperWord::is_vector_use(Node* use, int u_idx) {
2916   Node_List* u_pk = my_pack(use);
2917   if (u_pk == NULL) return false;
2918   if (use->is_reduction()) return true;
2919   Node* def = use->in(u_idx);
2920   Node_List* d_pk = my_pack(def);
2921   if (d_pk == NULL) {
2922     // check for scalar promotion
2923     Node* n = u_pk->at(0)->in(u_idx);
2924     for (uint i = 1; i < u_pk->size(); i++) {
2925       if (u_pk->at(i)->in(u_idx) != n) return false;
2926     }
2927     return true;
2928   }
2929   if (VectorNode::is_muladds2i(use)) {
2930     // MulAddS2I takes shorts and produces ints - hence the special checks
2931     // on alignment and size.
2932     if (u_pk->size() * 2 != d_pk->size()) {
2933       return false;
2934     }
2935     for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
2936       Node* ui = u_pk->at(i);
2937       Node* di = d_pk->at(i);
2938       if (alignment(ui) != alignment(di) * 2) {
2939         return false;
2940       }
2941     }
2942     return true;
2943   }
2944   if (u_pk->size() != d_pk->size())
2945     return false;
2946   for (uint i = 0; i < u_pk->size(); i++) {
2947     Node* ui = u_pk->at(i);
2948     Node* di = d_pk->at(i);
2949     if (ui->in(u_idx) != di || alignment(ui) != alignment(di))
2950       return false;
2951   }
2952   return true;
2953 }
2954 
//------------------------------construct_bb---------------------------
// Construct reverse postorder list of block members
// Also collects data-entry nodes and memory-slice head/tail pairs, and
// returns false to bail out when the loop contains node kinds SuperWord
// cannot handle (LoadStore, MergeMem, data Proj) or a malformed memory
// slice.
bool SuperWord::construct_bb() {
  Node* entry = bb();

  assert(_stk.length() == 0,            "stk is empty");
  assert(_block.length() == 0,          "block is empty");
  assert(_data_entry.length() == 0,     "data_entry is empty");
  assert(_mem_slice_head.length() == 0, "mem_slice_head is empty");
  assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");

  // Find non-control nodes with no inputs from within block,
  // create a temporary map from node _idx to bb_idx for use
  // by the visited and post_visited sets,
  // and count number of nodes in block.
  int bb_ct = 0;
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node *n = lpt()->_body.at(i);
    set_bb_idx(n, i); // Create a temporary map
    if (in_bb(n)) {
      if (n->is_LoadStore() || n->is_MergeMem() ||
          (n->is_Proj() && !n->as_Proj()->is_CFG())) {
        // Bailout if the loop has LoadStore, MergeMem or data Proj
        // nodes. Superword optimization does not work with them.
        return false;
      }
      bb_ct++;
      if (!n->is_CFG()) {
        // A non-CFG node with no in-block inputs is a data entry into the
        // block; these seed the depth-first walk below.
        bool found = false;
        for (uint j = 0; j < n->req(); j++) {
          Node* def = n->in(j);
          if (def && in_bb(def)) {
            found = true;
            break;
          }
        }
        if (!found) {
          assert(n != entry, "can't be entry");
          _data_entry.push(n);
        }
      }
    }
  }

  // Find memory slices (head and tail)
  for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
    Node *n = lp()->fast_out(i);
    if (in_bb(n) && (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
      Node* n_tail  = n->in(LoopNode::LoopBackControl);
      // A phi whose backedge input equals its entry input carries no
      // in-loop memory state, so it is not a slice.
      if (n_tail != n->in(LoopNode::EntryControl)) {
        if (!n_tail->is_Mem()) {
          assert(n_tail->is_Mem(), "unexpected node for memory slice: %s", n_tail->Name());
          return false; // Bailout
        }
        _mem_slice_head.push(n);
        _mem_slice_tail.push(n_tail);
      }
    }
  }

  // Create an RPO list of nodes in block

  visited_clear();
  post_visited_clear();

  // Push all non-control nodes with no inputs from within block, then control entry
  for (int j = 0; j < _data_entry.length(); j++) {
    Node* n = _data_entry.at(j);
    visited_set(n);
    _stk.push(n);
  }
  visited_set(entry);
  _stk.push(entry);

  // Do a depth first walk over out edges
  // Nodes are assigned RPO numbers from the back (rpo_idx counts down) as
  // they are post-visited.
  int rpo_idx = bb_ct - 1;
  int size;
  int reduction_uses = 0;
  while ((size = _stk.length()) > 0) {
    Node* n = _stk.top(); // Leave node on stack
    if (!visited_test_set(n)) {
      // forward arc in graph
    } else if (!post_visited_test(n)) {
      // cross or back arc
      for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
        Node *use = n->fast_out(i);
        if (in_bb(use) && !visited_test(use) &&
            // Don't go around backedge
            (!use->is_Phi() || n == entry)) {
          if (use->is_reduction()) {
            // First see if we can map the reduction on the given system we are on, then
            // make a data entry operation for each reduction we see.
            BasicType bt = use->bottom_type()->basic_type();
            if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
              reduction_uses++;
            }
          }
          _stk.push(use);
        }
      }
      if (_stk.length() == size) {
        // There were no additional uses, post visit node now
        _stk.pop(); // Remove node from stack
        assert(rpo_idx >= 0, "");
        _block.at_put_grow(rpo_idx, n);
        rpo_idx--;
        post_visited_set(n);
        assert(rpo_idx >= 0 || _stk.is_empty(), "");
      }
    } else {
      _stk.pop(); // Remove post-visited node from stack
    }
  }//while

  int ii_current = -1;
  unsigned int load_idx = (unsigned int)-1;
  // Build iterations order if needed
  bool build_ii_order = _do_vector_loop_experimental && _ii_order.is_empty();
  // Create real map of block indices for nodes
  for (int j = 0; j < _block.length(); j++) {
    Node* n = _block.at(j);
    set_bb_idx(n, j);
    if (build_ii_order && n->is_Load()) {
      // Record the clone-map generation of each group of loads cloned from
      // the same original load; this yields the unroll-iteration order.
      if (ii_current == -1) {
        ii_current = _clone_map.gen(n->_idx);
        _ii_order.push(ii_current);
        load_idx = _clone_map.idx(n->_idx);
      } else if (_clone_map.idx(n->_idx) == load_idx && _clone_map.gen(n->_idx) != ii_current) {
        ii_current = _clone_map.gen(n->_idx);
        _ii_order.push(ii_current);
      }
    }
  }//for

  // Ensure extra info is allocated.
  initialize_bb();

#ifndef PRODUCT
  if (_vector_loop_debug && _ii_order.length() > 0) {
    tty->print("SuperWord::construct_bb: List of generations: ");
    for (int jj = 0; jj < _ii_order.length(); ++jj) {
      tty->print("  %d:%d", jj, _ii_order.at(jj));
    }
    tty->print_cr(" ");
  }
  if (TraceSuperWord) {
    print_bb();
    tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > 0 ? "" : "NONE");
    for (int m = 0; m < _data_entry.length(); m++) {
      tty->print("%3d ", m);
      _data_entry.at(m)->dump();
    }
    tty->print_cr("\nmemory slices: %s", _mem_slice_head.length() > 0 ? "" : "NONE");
    for (int m = 0; m < _mem_slice_head.length(); m++) {
      tty->print("%3d ", m); _mem_slice_head.at(m)->dump();
      tty->print("    ");    _mem_slice_tail.at(m)->dump();
    }
  }
#endif
  assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
  return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
}
3117 
3118 //------------------------------initialize_bb---------------------------
3119 // Initialize per node info
initialize_bb()3120 void SuperWord::initialize_bb() {
3121   Node* last = _block.at(_block.length() - 1);
3122   grow_node_info(bb_idx(last));
3123 }
3124 
3125 //------------------------------bb_insert_after---------------------------
3126 // Insert n into block after pos
bb_insert_after(Node * n,int pos)3127 void SuperWord::bb_insert_after(Node* n, int pos) {
3128   int n_pos = pos + 1;
3129   // Make room
3130   for (int i = _block.length() - 1; i >= n_pos; i--) {
3131     _block.at_put_grow(i+1, _block.at(i));
3132   }
3133   for (int j = _node_info.length() - 1; j >= n_pos; j--) {
3134     _node_info.at_put_grow(j+1, _node_info.at(j));
3135   }
3136   // Set value
3137   _block.at_put_grow(n_pos, n);
3138   _node_info.at_put_grow(n_pos, SWNodeInfo::initial);
3139   // Adjust map from node->_idx to _block index
3140   for (int i = n_pos; i < _block.length(); i++) {
3141     set_bb_idx(_block.at(i), i);
3142   }
3143 }
3144 
3145 //------------------------------compute_max_depth---------------------------
3146 // Compute max depth for expressions from beginning of block
3147 // Use to prune search paths during test for independence.
compute_max_depth()3148 void SuperWord::compute_max_depth() {
3149   int ct = 0;
3150   bool again;
3151   do {
3152     again = false;
3153     for (int i = 0; i < _block.length(); i++) {
3154       Node* n = _block.at(i);
3155       if (!n->is_Phi()) {
3156         int d_orig = depth(n);
3157         int d_in   = 0;
3158         for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
3159           Node* pred = preds.current();
3160           if (in_bb(pred)) {
3161             d_in = MAX2(d_in, depth(pred));
3162           }
3163         }
3164         if (d_in + 1 != d_orig) {
3165           set_depth(n, d_in + 1);
3166           again = true;
3167         }
3168       }
3169     }
3170     ct++;
3171   } while (again);
3172 
3173   if (TraceSuperWord && Verbose) {
3174     tty->print_cr("compute_max_depth iterated: %d times", ct);
3175   }
3176 }
3177 
3178 //-------------------------compute_vector_element_type-----------------------
3179 // Compute necessary vector element type for expressions
3180 // This propagates backwards a narrower integer type when the
3181 // upper bits of the value are not needed.
3182 // Example:  char a,b,c;  a = b + c;
3183 // Normally the type of the add is integer, but for packed character
3184 // operations the type of the add needs to be char.
void SuperWord::compute_vector_element_type() {
  if (TraceSuperWord && Verbose) {
    tty->print_cr("\ncompute_velt_type:");
  }

  // Initial type: seed every node with its container type (memory type
  // for loads/stores, TypeInt::INT for int arithmetic, else igvn type).
  for (int i = 0; i < _block.length(); i++) {
    Node* n = _block.at(i);
    set_velt_type(n, container_type(n));
  }

  // Propagate integer narrowed type backwards through operations
  // that don't depend on higher order bits.
  // Walk the block in reverse RPO so a node is processed before its inputs.
  for (int i = _block.length() - 1; i >= 0; i--) {
    Node* n = _block.at(i);
    // Only integer types need be examined
    const Type* vtn = velt_type(n);
    if (vtn->basic_type() == T_INT) {
      // [start, end) is the input range that becomes vector operands.
      uint start, end;
      VectorNode::vector_operands(n, &start, &end);

      for (uint j = start; j < end; j++) {
        Node* in  = n->in(j);
        // Don't propagate through a memory
        if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
            data_size(n) < data_size(in)) {
          // Only narrow 'in' if every in-block use agrees with n's
          // (narrower) element type; another use may still need the bits.
          bool same_type = true;
          for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
            Node *use = in->fast_out(k);
            if (!in_bb(use) || !same_velt_type(use, n)) {
              same_type = false;
              break;
            }
          }
          if (same_type) {
            // In any Java arithmetic operation, operands of small integer types
            // (boolean, byte, char & short) should be promoted to int first. As
            // vector elements of small types don't have upper bits of int, for
            // RShiftI or AbsI operations, the compiler has to know the precise
            // signedness info of the 1st operand. These operations shouldn't be
            // vectorized if the signedness info is imprecise.
            const Type* vt = vtn;
            int op = in->Opcode();
            if (VectorNode::is_shift_opcode(op) || op == Op_AbsI) {
              Node* load = in->in(1);
              if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
                // Only Load nodes distinguish signed (LoadS/LoadB) and unsigned
                // (LoadUS/LoadUB) values. Store nodes only have one version.
                vt = velt_type(load);
              } else if (op != Op_LShiftI) {
                // Widen type to int to avoid the creation of vector nodes. Note
                // that left shifts work regardless of the signedness.
                vt = TypeInt::INT;
              }
            }
            set_velt_type(in, vt);
          }
        }
      }
    }
  }
#ifndef PRODUCT
  if (TraceSuperWord && Verbose) {
    for (int i = 0; i < _block.length(); i++) {
      Node* n = _block.at(i);
      velt_type(n)->dump();
      tty->print("\t");
      n->dump();
    }
  }
#endif
}
3257 
3258 //------------------------------memory_alignment---------------------------
3259 // Alignment within a vector memory reference
int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
#ifndef PRODUCT
  if ((TraceSuperWord && Verbose) || is_trace_alignment()) {
    tty->print("SuperWord::memory_alignment within a vector memory reference for %d:  ", s->_idx); s->dump();
  }
#endif
  NOT_PRODUCT(SWPointer::Tracer::Depth ddd(0);)
  // Decompose the address into base + k*iv + offset; bail out if the
  // address expression is too complex to analyze.
  SWPointer p(s, this, NULL, false);
  if (!p.valid()) {
    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align");)
    return bottom_align;
  }
  int vw = get_vw_bytes_special(s);
  if (vw < 2) {
    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
    return bottom_align; // No vectors for this type
  }
  // Alignment is the byte offset modulo the vector width, after
  // advancing the reference by iv_adjust iterations, normalized into
  // [0, vw) (C++ '%' may yield a negative remainder).
  int offset  = p.offset_in_bytes();
  offset     += iv_adjust*p.memory_size();
  int off_rem = offset % vw;
  int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
#ifndef PRODUCT
  if ((TraceSuperWord && Verbose) || is_trace_alignment()) {
    tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod);
  }
#endif
  return off_mod;
}
3288 
3289 //---------------------------container_type---------------------------
3290 // Smallest type containing range of values
container_type(Node * n)3291 const Type* SuperWord::container_type(Node* n) {
3292   if (n->is_Mem()) {
3293     BasicType bt = n->as_Mem()->memory_type();
3294     if (n->is_Store() && (bt == T_CHAR)) {
3295       // Use T_SHORT type instead of T_CHAR for stored values because any
3296       // preceding arithmetic operation extends values to signed Int.
3297       bt = T_SHORT;
3298     }
3299     if (n->Opcode() == Op_LoadUB) {
3300       // Adjust type for unsigned byte loads, it is important for right shifts.
3301       // T_BOOLEAN is used because there is no basic type representing type
3302       // TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
3303       // size (one byte) and sign is important.
3304       bt = T_BOOLEAN;
3305     }
3306     return Type::get_const_basic_type(bt);
3307   }
3308   const Type* t = _igvn.type(n);
3309   if (t->basic_type() == T_INT) {
3310     // A narrow type of arithmetic operations will be determined by
3311     // propagating the type of memory operations.
3312     return TypeInt::INT;
3313   }
3314   return t;
3315 }
3316 
same_velt_type(Node * n1,Node * n2)3317 bool SuperWord::same_velt_type(Node* n1, Node* n2) {
3318   const Type* vt1 = velt_type(n1);
3319   const Type* vt2 = velt_type(n2);
3320   if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
3321     // Compare vectors element sizes for integer types.
3322     return data_size(n1) == data_size(n2);
3323   }
3324   return vt1 == vt2;
3325 }
3326 
3327 //------------------------------in_packset---------------------------
3328 // Are s1 and s2 in a pack pair and ordered as s1,s2?
in_packset(Node * s1,Node * s2)3329 bool SuperWord::in_packset(Node* s1, Node* s2) {
3330   for (int i = 0; i < _packset.length(); i++) {
3331     Node_List* p = _packset.at(i);
3332     assert(p->size() == 2, "must be");
3333     if (p->at(0) == s1 && p->at(p->size()-1) == s2) {
3334       return true;
3335     }
3336   }
3337   return false;
3338 }
3339 
3340 //------------------------------in_pack---------------------------
3341 // Is s in pack p?
in_pack(Node * s,Node_List * p)3342 Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
3343   for (uint i = 0; i < p->size(); i++) {
3344     if (p->at(i) == s) {
3345       return p;
3346     }
3347   }
3348   return NULL;
3349 }
3350 
3351 //------------------------------remove_pack_at---------------------------
3352 // Remove the pack at position pos in the packset
remove_pack_at(int pos)3353 void SuperWord::remove_pack_at(int pos) {
3354   Node_List* p = _packset.at(pos);
3355   for (uint i = 0; i < p->size(); i++) {
3356     Node* s = p->at(i);
3357     set_my_pack(s, NULL);
3358   }
3359   _packset.remove_at(pos);
3360 }
3361 
packset_sort(int n)3362 void SuperWord::packset_sort(int n) {
3363   // simple bubble sort so that we capitalize with O(n) when its already sorted
3364   while (n != 0) {
3365     bool swapped = false;
3366     for (int i = 1; i < n; i++) {
3367       Node_List* q_low = _packset.at(i-1);
3368       Node_List* q_i = _packset.at(i);
3369 
3370       // only swap when we find something to swap
3371       if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
3372         Node_List* t = q_i;
3373         *(_packset.adr_at(i)) = q_low;
3374         *(_packset.adr_at(i-1)) = q_i;
3375         swapped = true;
3376       }
3377     }
3378     if (swapped == false) break;
3379     n--;
3380   }
3381 }
3382 
3383 //------------------------------executed_first---------------------------
3384 // Return the node executed first in pack p.  Uses the RPO block list
3385 // to determine order.
executed_first(Node_List * p)3386 Node* SuperWord::executed_first(Node_List* p) {
3387   Node* n = p->at(0);
3388   int n_rpo = bb_idx(n);
3389   for (uint i = 1; i < p->size(); i++) {
3390     Node* s = p->at(i);
3391     int s_rpo = bb_idx(s);
3392     if (s_rpo < n_rpo) {
3393       n = s;
3394       n_rpo = s_rpo;
3395     }
3396   }
3397   return n;
3398 }
3399 
3400 //------------------------------executed_last---------------------------
3401 // Return the node executed last in pack p.
executed_last(Node_List * p)3402 Node* SuperWord::executed_last(Node_List* p) {
3403   Node* n = p->at(0);
3404   int n_rpo = bb_idx(n);
3405   for (uint i = 1; i < p->size(); i++) {
3406     Node* s = p->at(i);
3407     int s_rpo = bb_idx(s);
3408     if (s_rpo > n_rpo) {
3409       n = s;
3410       n_rpo = s_rpo;
3411     }
3412   }
3413   return n;
3414 }
3415 
control_dependency(Node_List * p)3416 LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
3417   LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
3418   for (uint i = 0; i < p->size(); i++) {
3419     Node* n = p->at(i);
3420     assert(n->is_Load(), "only meaningful for loads");
3421     if (!n->depends_only_on_test()) {
3422       if (n->as_Load()->has_unknown_control_dependency() &&
3423           dep != LoadNode::Pinned) {
3424         // Upgrade to unknown control...
3425         dep = LoadNode::UnknownControl;
3426       } else {
3427         // Otherwise, we must pin it.
3428         dep = LoadNode::Pinned;
3429       }
3430     }
3431   }
3432   return dep;
3433 }
3434 
3435 
3436 //----------------------------align_initial_loop_index---------------------------
3437 // Adjust pre-loop limit so that in main loop, a load/store reference
3438 // to align_to_ref will be a position zero in the vector.
3439 //   (iv + k) mod vector_align == 0
void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
  assert(lp()->is_main_loop(), "");
  // The pre-loop limit is hidden behind an Opaque1 node so the limit
  // adjustment below survives igvn; rewrite its input at the end.
  CountedLoopEndNode* pre_end = pre_loop_end();
  Node* pre_opaq1 = pre_end->limit();
  assert(pre_opaq1->Opcode() == Op_Opaque1, "");
  Opaque1Node* pre_opaq = (Opaque1Node*)pre_opaq1;
  Node* lim0 = pre_opaq->in(1);

  // Where we put new limit calculations
  Node* pre_ctrl = pre_loop_head()->in(LoopNode::EntryControl);

  // Ensure the original loop limit is available from the
  // pre-loop Opaque1 node.
  Node* orig_limit = pre_opaq->original_loop_limit();
  assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");

  SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
  assert(align_to_ref_p.valid(), "sanity");

  // Given:
  //     lim0 == original pre loop limit
  //     V == v_align (power of 2)
  //     invar == extra invariant piece of the address expression
  //     e == offset [ +/- invar ]
  //
  // When reassociating expressions involving '%' the basic rules are:
  //     (a - b) % k == 0   =>  a % k == b % k
  // and:
  //     (a + b) % k == 0   =>  a % k == (k - b) % k
  //
  // For stride > 0 && scale > 0,
  //   Derive the new pre-loop limit "lim" such that the two constraints:
  //     (1) lim = lim0 + N           (where N is some positive integer < V)
  //     (2) (e + lim) % V == 0
  //   are true.
  //
  //   Substituting (1) into (2),
  //     (e + lim0 + N) % V == 0
  //   solve for N:
  //     N = (V - (e + lim0)) % V
  //   substitute back into (1), so that new limit
  //     lim = lim0 + (V - (e + lim0)) % V
  //
  // For stride > 0 && scale < 0
  //   Constraints:
  //     lim = lim0 + N
  //     (e - lim) % V == 0
  //   Solving for lim:
  //     (e - lim0 - N) % V == 0
  //     N = (e - lim0) % V
  //     lim = lim0 + (e - lim0) % V
  //
  // For stride < 0 && scale > 0
  //   Constraints:
  //     lim = lim0 - N
  //     (e + lim) % V == 0
  //   Solving for lim:
  //     (e + lim0 - N) % V == 0
  //     N = (e + lim0) % V
  //     lim = lim0 - (e + lim0) % V
  //
  // For stride < 0 && scale < 0
  //   Constraints:
  //     lim = lim0 - N
  //     (e - lim) % V == 0
  //   Solving for lim:
  //     (e - lim0 + N) % V == 0
  //     N = (V - (e - lim0)) % V
  //     lim = lim0 - (V - (e - lim0)) % V

  int vw = vector_width_in_bytes(align_to_ref);
  int stride   = iv_stride();
  int scale    = align_to_ref_p.scale_in_bytes();
  int elt_size = align_to_ref_p.memory_size();
  int v_align  = vw / elt_size;          // vector width in elements
  assert(v_align > 1, "sanity");
  int offset   = align_to_ref_p.offset_in_bytes() / elt_size;
  Node *offsn  = _igvn.intcon(offset);

  // Build up 'e' (in elements) piece by piece, registering each new node
  // with igvn and pinning its control at the pre-loop entry.
  Node *e = offsn;
  if (align_to_ref_p.invar() != NULL) {
    // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
    Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
    Node* invar = align_to_ref_p.invar();
    if (_igvn.type(invar)->isa_long()) {
      // Computations are done % (vector width/element size) so it's
      // safe to simply convert invar to an int and loose the upper 32
      // bit half.
      invar = new ConvL2INode(invar);
      _igvn.register_new_node_with_optimizer(invar);
    }
    Node* invar_scale = align_to_ref_p.invar_scale();
    if (invar_scale != NULL) {
      invar = new LShiftINode(invar, invar_scale);
      _igvn.register_new_node_with_optimizer(invar);
    }
    Node* aref = new URShiftINode(invar, log2_elt);
    _igvn.register_new_node_with_optimizer(aref);
    _phase->set_ctrl(aref, pre_ctrl);
    if (align_to_ref_p.negate_invar()) {
      e = new SubINode(e, aref);
    } else {
      e = new AddINode(e, aref);
    }
    _igvn.register_new_node_with_optimizer(e);
    _phase->set_ctrl(e, pre_ctrl);
  }
  if (vw > ObjectAlignmentInBytes || align_to_ref_p.base()->is_top()) {
    // incorporate base e +/- base && Mask >>> log2(elt)
    Node* xbase = new CastP2XNode(NULL, align_to_ref_p.adr());
    _igvn.register_new_node_with_optimizer(xbase);
#ifdef _LP64
    xbase  = new ConvL2INode(xbase);
    _igvn.register_new_node_with_optimizer(xbase);
#endif
    Node* mask = _igvn.intcon(vw-1);
    Node* masked_xbase  = new AndINode(xbase, mask);
    _igvn.register_new_node_with_optimizer(masked_xbase);
    Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
    Node* bref     = new URShiftINode(masked_xbase, log2_elt);
    _igvn.register_new_node_with_optimizer(bref);
    _phase->set_ctrl(bref, pre_ctrl);
    e = new AddINode(e, bref);
    _igvn.register_new_node_with_optimizer(e);
    _phase->set_ctrl(e, pre_ctrl);
  }

  // compute e +/- lim0
  if (scale < 0) {
    e = new SubINode(e, lim0);
  } else {
    e = new AddINode(e, lim0);
  }
  _igvn.register_new_node_with_optimizer(e);
  _phase->set_ctrl(e, pre_ctrl);

  if (stride * scale > 0) {
    // compute V - (e +/- lim0)
    Node* va  = _igvn.intcon(v_align);
    e = new SubINode(va, e);
    _igvn.register_new_node_with_optimizer(e);
    _phase->set_ctrl(e, pre_ctrl);
  }
  // compute N = (exp) % V  (V is a power of 2, so '& (V-1)' suffices)
  Node* va_msk = _igvn.intcon(v_align - 1);
  Node* N = new AndINode(e, va_msk);
  _igvn.register_new_node_with_optimizer(N);
  _phase->set_ctrl(N, pre_ctrl);

  //   substitute back into (1), so that new limit
  //     lim = lim0 + N
  Node* lim;
  if (stride < 0) {
    lim = new SubINode(lim0, N);
  } else {
    lim = new AddINode(lim0, N);
  }
  _igvn.register_new_node_with_optimizer(lim);
  _phase->set_ctrl(lim, pre_ctrl);
  // Clamp the new limit so it never passes the original loop limit.
  Node* constrained =
    (stride > 0) ? (Node*) new MinINode(lim, orig_limit)
                 : (Node*) new MaxINode(lim, orig_limit);
  _igvn.register_new_node_with_optimizer(constrained);
  _phase->set_ctrl(constrained, pre_ctrl);
  // Install the adjusted limit behind the pre-loop's Opaque1 node.
  _igvn.replace_input_of(pre_opaq, 1, constrained);
}
3606 
3607 //----------------------------get_pre_loop_end---------------------------
3608 // Find pre loop end from main loop.  Returns null if none.
find_pre_loop_end(CountedLoopNode * cl) const3609 CountedLoopEndNode* SuperWord::find_pre_loop_end(CountedLoopNode* cl) const {
3610   // The loop cannot be optimized if the graph shape at
3611   // the loop entry is inappropriate.
3612   if (!PhaseIdealLoop::is_canonical_loop_entry(cl)) {
3613     return NULL;
3614   }
3615 
3616   Node* p_f = cl->skip_predicates()->in(0)->in(0);
3617   if (!p_f->is_IfFalse()) return NULL;
3618   if (!p_f->in(0)->is_CountedLoopEnd()) return NULL;
3619   CountedLoopEndNode* pre_end = p_f->in(0)->as_CountedLoopEnd();
3620   CountedLoopNode* loop_node = pre_end->loopnode();
3621   if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
3622   return pre_end;
3623 }
3624 
3625 //------------------------------init---------------------------
void SuperWord::init() {
  // Full reset of all per-loop analysis state before processing a loop.
  _dg.init();
  _packset.clear();
  _disjoint_ptrs.clear();
  _block.clear();
  _post_block.clear();
  _data_entry.clear();
  _mem_slice_head.clear();
  _mem_slice_tail.clear();
  _iteration_first.clear();
  _iteration_last.clear();
  _node_info.clear();
  // Reset scalar state as well.
  _align_to_ref = NULL;
  _lpt = NULL;
  _lp = NULL;
  _bb = NULL;
  _iv = NULL;
  _race_possible = 0;
  _early_return = false;
  _num_work_vecs = 0;
  _num_reductions = 0;
}
3648 
3649 //------------------------------restart---------------------------
void SuperWord::restart() {
  // Partial reset for re-running the analysis on the same loop: clears
  // the dependence graph, packset, and block state, but — unlike init() —
  // keeps _iteration_first/_iteration_last and the scalar loop fields
  // (_lpt, _lp, _bb, _iv, counters).
  _dg.init();
  _packset.clear();
  _disjoint_ptrs.clear();
  _block.clear();
  _post_block.clear();
  _data_entry.clear();
  _mem_slice_head.clear();
  _mem_slice_tail.clear();
  _node_info.clear();
}
3661 
3662 //------------------------------print_packset---------------------------
print_packset()3663 void SuperWord::print_packset() {
3664 #ifndef PRODUCT
3665   tty->print_cr("packset");
3666   for (int i = 0; i < _packset.length(); i++) {
3667     tty->print_cr("Pack: %d", i);
3668     Node_List* p = _packset.at(i);
3669     print_pack(p);
3670   }
3671 #endif
3672 }
3673 
3674 //------------------------------print_pack---------------------------
print_pack(Node_List * p)3675 void SuperWord::print_pack(Node_List* p) {
3676   for (uint i = 0; i < p->size(); i++) {
3677     print_stmt(p->at(i));
3678   }
3679 }
3680 
3681 //------------------------------print_bb---------------------------
print_bb()3682 void SuperWord::print_bb() {
3683 #ifndef PRODUCT
3684   tty->print_cr("\nBlock");
3685   for (int i = 0; i < _block.length(); i++) {
3686     Node* n = _block.at(i);
3687     tty->print("%d ", i);
3688     if (n) {
3689       n->dump();
3690     }
3691   }
3692 #endif
3693 }
3694 
3695 //------------------------------print_stmt---------------------------
void SuperWord::print_stmt(Node* s) {
#ifndef PRODUCT
  // Show the pack alignment assigned to s, then dump the node itself.
  tty->print(" align: %d \t", alignment(s));
  s->dump();
#endif
}
3702 
3703 //------------------------------blank---------------------------
blank(uint depth)3704 char* SuperWord::blank(uint depth) {
3705   static char blanks[101];
3706   assert(depth < 101, "too deep");
3707   for (uint i = 0; i < depth; i++) blanks[i] = ' ';
3708   blanks[depth] = '\0';
3709   return blanks;
3710 }
3711 
3712 
3713 //==============================SWPointer===========================
3714 #ifndef PRODUCT
3715 int SWPointer::Tracer::_depth = 0;
3716 #endif
3717 //----------------------------SWPointer------------------------
SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
  _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
  _invar_scale(NULL),
  _nstack(nstack), _analyze_only(analyze_only),
  _stack_idx(0)
#ifndef PRODUCT
  , _tracer(slp)
#endif
{
  NOT_PRODUCT(_tracer.ctor_1(mem);)

  // Decompose mem's address into base + scale*iv + offset [+/- invar].
  // Any early return below leaves _base/_adr NULL, so valid() is false.
  Node* adr = mem->in(MemNode::Address);
  if (!adr->is_AddP()) {
    assert(!valid(), "too complex");
    return;
  }
  // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
  Node* base = adr->in(AddPNode::Base);
  // The base address should be loop invariant
  if (is_main_loop_member(base)) {
    assert(!valid(), "base address is loop variant");
    return;
  }
  // unsafe references require misaligned vector access support
  if (base->is_top() && !Matcher::misaligned_vectors_ok()) {
    assert(!valid(), "unsafe access");
    return;
  }

  NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.store_depth();)
  NOT_PRODUCT(_tracer.ctor_2(adr);)

  // Walk at most three levels of AddP, folding each level's Offset input
  // into _scale/_offset/_invar via scaled_iv_plus_offset().
  int i;
  for (i = 0; i < 3; i++) {
    NOT_PRODUCT(_tracer.ctor_3(adr, i);)

    if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
      assert(!valid(), "too complex");
      return;
    }
    adr = adr->in(AddPNode::Address);
    NOT_PRODUCT(_tracer.ctor_4(adr, i);)

    if (base == adr || !adr->is_AddP()) {
      NOT_PRODUCT(_tracer.ctor_5(adr, base, i);)
      break; // stop looking at addp's
    }
  }
  // The remaining address must itself be loop invariant.
  if (is_main_loop_member(adr)) {
    assert(!valid(), "adr is loop variant");
    return;
  }

  if (!base->is_top() && adr != base) {
    assert(!valid(), "adr and base differ");
    return;
  }

  NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.restore_depth();)
  NOT_PRODUCT(_tracer.ctor_6(mem);)

  // Success: record base/adr, which makes valid() true.
  _base = base;
  _adr  = adr;
  assert(valid(), "Usable");
}
3784 
3785 // Following is used to create a temporary object during
3786 // the pattern match of an address expression.
SWPointer::SWPointer(SWPointer* p) :
  _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
  _invar_scale(NULL),
  _nstack(p->_nstack), _analyze_only(p->_analyze_only),
  _stack_idx(p->_stack_idx)
  #ifndef PRODUCT
  , _tracer(p->_slp)
  #endif
{
  // Copies only the matching context (mem, slp, nstack, ...); the address
  // components (_base, _adr, _scale, _offset, _invar) start out empty so
  // a subtree can be matched independently of p's partial results.
}
3797 
is_main_loop_member(Node * n) const3798 bool SWPointer::is_main_loop_member(Node* n) const {
3799   Node* n_c = phase()->get_ctrl(n);
3800   return lpt()->is_member(phase()->get_loop(n_c));
3801 }
3802 
bool SWPointer::invariant(Node* n) const {
  NOT_PRODUCT(Tracer::Depth dd;)
  Node* n_c = phase()->get_ctrl(n);
  NOT_PRODUCT(_tracer.invariant_1(n, n_c);)
  // n is a candidate invariant when its control is outside the loop.
  bool is_not_member = !is_main_loop_member(n);
  if (is_not_member && _slp->lp()->is_main_loop()) {
    // Check that n_c dominates the pre loop head node. If it does not, then we cannot use n as invariant for the pre loop
    // CountedLoopEndNode check because n_c is either part of the pre loop or between the pre and the main loop (illegal
    // invariant: Happens, for example, when n_c is a CastII node that prevents data nodes to flow above the main loop).
    return phase()->is_dominator(n_c, _slp->pre_loop_head());
  }
  return is_not_member;
}
3816 
3817 //------------------------scaled_iv_plus_offset--------------------
3818 // Match: k*iv + offset
3819 // where: k is a constant that maybe zero, and
3820 //        offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
bool SWPointer::scaled_iv_plus_offset(Node* n) {
  NOT_PRODUCT(Tracer::Depth ddd;)
  NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);)

  // Base case: a pure k*iv term ...
  if (scaled_iv(n)) {
    NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);)
    return true;
  }

  // ... or a pure (k2 [+/- invariant]) offset term.
  if (offset_plus_k(n)) {
    NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);)
    return true;
  }

  // Otherwise match AddI/SubI combining a scaled-iv part with an offset
  // part. Note: offset_plus_k/scaled_iv record their findings into
  // _scale/_offset/_invar as side effects, so match order matters.
  int opc = n->Opcode();
  if (opc == Op_AddI) {
    if (offset_plus_k(n->in(2)) && scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);)
      return true;
    }
    if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);)
      return true;
    }
  } else if (opc == Op_SubI) {
    // Subtrahend offset is negated via offset_plus_k's 'negate' flag.
    if (offset_plus_k(n->in(2), true) && scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);)
      return true;
    }
    if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) {
      // The iv term is subtracted, so the recorded scale flips sign.
      _scale *= -1;
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);)
      return true;
    }
  }

  NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);)
  return false;
}
3860 
3861 //----------------------------scaled_iv------------------------
3862 // Match: k*iv where k is a constant that's not zero
bool SWPointer::scaled_iv(Node* n) {
  NOT_PRODUCT(Tracer::Depth ddd;)
  NOT_PRODUCT(_tracer.scaled_iv_1(n);)

  if (_scale != 0) { // already found a scale
    NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);)
    return false;
  }

  // The iv itself matches as 1*iv.
  if (n == iv()) {
    _scale = 1;
    NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);)
    return true;
  }
  // In analyze-only mode, collect loop-variant nodes on the stack for
  // the caller instead of committing to a match.
  if (_analyze_only && (is_main_loop_member(n))) {
    _nstack->push(n, _stack_idx++);
  }

  int opc = n->Opcode();
  if (opc == Op_MulI) {
    // iv * con (in either operand order).
    if (n->in(1) == iv() && n->in(2)->is_Con()) {
      _scale = n->in(2)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);)
      return true;
    } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
      _scale = n->in(1)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);)
      return true;
    }
  } else if (opc == Op_LShiftI) {
    // iv << con is iv * 2^con.
    if (n->in(1) == iv() && n->in(2)->is_Con()) {
      _scale = 1 << n->in(2)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
      return true;
    }
  } else if (opc == Op_ConvI2L) {
    if (n->in(1)->Opcode() == Op_CastII &&
        n->in(1)->as_CastII()->has_range_check()) {
      // Skip range check dependent CastII nodes
      n = n->in(1);
    }
    // Look through the int-to-long conversion.
    if (scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_7(n);)
      return true;
    }
  } else if (opc == Op_LShiftL && n->in(2)->is_Con()) {
    if (!has_iv() && _invar == NULL) {
      // Need to preserve the current _offset value, so
      // create a temporary object for this expression subtree.
      // Hacky, so should re-engineer the address pattern match.
      NOT_PRODUCT(Tracer::Depth dddd;)
      SWPointer tmp(this);
      NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)

      if (tmp.scaled_iv_plus_offset(n->in(1))) {
        // Fold the shift amount into the components matched for the
        // subtree and copy them back into this SWPointer.
        int scale = n->in(2)->get_int();
        _scale   = tmp._scale  << scale;
        _offset += tmp._offset << scale;
        _invar = tmp._invar;
        if (_invar != NULL) {
          _negate_invar = tmp._negate_invar;
          _invar_scale = n->in(2);
        }
        NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, _invar, _negate_invar);)
        return true;
      }
    }
  }
  NOT_PRODUCT(_tracer.scaled_iv_10(n);)
  return false;
}
3934 
3935 //----------------------------offset_plus_k------------------------
3936 // Match: offset is (k [+/- invariant])
3937 // where k maybe zero and invariant is optional, but not both.
offset_plus_k(Node * n,bool negate)3938 bool SWPointer::offset_plus_k(Node* n, bool negate) {
3939   NOT_PRODUCT(Tracer::Depth ddd;)
3940   NOT_PRODUCT(_tracer.offset_plus_k_1(n);)
3941 
3942   int opc = n->Opcode();
3943   if (opc == Op_ConI) {
3944     _offset += negate ? -(n->get_int()) : n->get_int();
3945     NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
3946     return true;
3947   } else if (opc == Op_ConL) {
3948     // Okay if value fits into an int
3949     const TypeLong* t = n->find_long_type();
3950     if (t->higher_equal(TypeLong::INT)) {
3951       jlong loff = n->get_long();
3952       jint  off  = (jint)loff;
3953       _offset += negate ? -off : loff;
3954       NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
3955       return true;
3956     }
3957     NOT_PRODUCT(_tracer.offset_plus_k_4(n);)
3958     return false;
3959   }
3960   if (_invar != NULL) { // already has an invariant
3961     NOT_PRODUCT(_tracer.offset_plus_k_5(n, _invar);)
3962     return false;
3963   }
3964 
3965   if (_analyze_only && is_main_loop_member(n)) {
3966     _nstack->push(n, _stack_idx++);
3967   }
3968   if (opc == Op_AddI) {
3969     if (n->in(2)->is_Con() && invariant(n->in(1))) {
3970       _negate_invar = negate;
3971       _invar = n->in(1);
3972       _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
3973       NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, _negate_invar, _offset);)
3974       return true;
3975     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
3976       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
3977       _negate_invar = negate;
3978       _invar = n->in(2);
3979       NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, _negate_invar, _offset);)
3980       return true;
3981     }
3982   }
3983   if (opc == Op_SubI) {
3984     if (n->in(2)->is_Con() && invariant(n->in(1))) {
3985       _negate_invar = negate;
3986       _invar = n->in(1);
3987       _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
3988       NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, _negate_invar, _offset);)
3989       return true;
3990     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
3991       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
3992       _negate_invar = !negate;
3993       _invar = n->in(2);
3994       NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, _negate_invar, _offset);)
3995       return true;
3996     }
3997   }
3998 
3999   if (!is_main_loop_member(n)) {
4000     // 'n' is loop invariant. Skip range check dependent CastII nodes before checking if 'n' is dominating the pre loop.
4001     if (opc == Op_ConvI2L) {
4002       n = n->in(1);
4003       if (n->Opcode() == Op_CastII &&
4004           n->as_CastII()->has_range_check()) {
4005         // Skip range check dependent CastII nodes
4006         assert(!is_main_loop_member(n), "sanity");
4007         n = n->in(1);
4008       }
4009     }
4010     // Check if 'n' can really be used as invariant (not in main loop and dominating the pre loop).
4011     if (invariant(n)) {
4012       _negate_invar = negate;
4013       _invar = n;
4014       NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, _negate_invar, _offset);)
4015       return true;
4016     }
4017   }
4018 
4019   NOT_PRODUCT(_tracer.offset_plus_k_11(n);)
4020   return false;
4021 }
4022 
//----------------------------print------------------------
// Debug-only dump of the decomposed address:
// base/adr node indices, scale, offset, and the optional invariant term.
void SWPointer::print() {
#ifndef PRODUCT
  tty->print("base: [%d]  adr: [%d]  scale: %d  offset: %d",
             _base != NULL ? _base->_idx : 0,   // 0 when no base node
             _adr  != NULL ? _adr->_idx  : 0,
             _scale, _offset);
  if (_invar != NULL) {
    // Sign reflects _negate_invar; _invar_scale is the shift-amount node.
    tty->print("  invar: %c[%d] << [%d]", _negate_invar?'-':'+', _invar->_idx, _invar_scale->_idx);
  }
  tty->cr();
#endif
}
4036 
4037 //----------------------------tracing------------------------
4038 #ifndef PRODUCT
print_depth() const4039 void SWPointer::Tracer::print_depth() const {
4040   for (int ii = 0; ii < _depth; ++ii) {
4041     tty->print("  ");
4042   }
4043 }
4044 
// Trace: SWPointer construction begins for memory node 'mem'.
void SWPointer::Tracer::ctor_1 (Node* mem) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::SWPointer: start alignment analysis", mem->_idx); mem->dump();
  }
}

// Trace: dump the address node and its AddP base, increasing indent.
void SWPointer::Tracer::ctor_2(Node* adr) {
  if(_slp->is_trace_alignment()) {
    //store_depth();
    inc_depth();
    print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: ", adr->_idx); adr->dump();
    inc_depth();
    print_depth(); tty->print(" %d (base) SWPointer::SWPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
  }
}

// Trace: dump the AddP offset input at iteration 'i' of the address walk.
void SWPointer::Tracer::ctor_3(Node* adr, int i) {
  if(_slp->is_trace_alignment()) {
    inc_depth();
    Node* offset = adr->in(AddPNode::Offset);
    print_depth(); tty->print(" %d (offset) SWPointer::SWPointer: i = %d: ", offset->_idx, i); offset->dump();
  }
}

// Trace: dump the next address node reached at iteration 'i'.
void SWPointer::Tracer::ctor_4(Node* adr, int i) {
  if(_slp->is_trace_alignment()) {
    inc_depth();
    print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: i = %d: ", adr->_idx, i); adr->dump();
  }
}

// Trace: report why the address walk stops (adr == base, or adr not AddP).
void SWPointer::Tracer::ctor_5(Node* adr, Node* base, int i) {
  if(_slp->is_trace_alignment()) {
    inc_depth();
    if (base == adr) {
      print_depth(); tty->print_cr("  \\ %d (adr) == %d (base) SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i);
    } else if (!adr->is_AddP()) {
      print_depth(); tty->print_cr("  \\ %d (adr) is NOT Addp SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, i);
    }
  }
}

// Trace: SWPointer construction finished for memory node 'mem'.
void SWPointer::Tracer::ctor_6(Node* mem) {
  if(_slp->is_trace_alignment()) {
    //restore_depth();
    print_depth(); tty->print_cr(" %d (adr) SWPointer::SWPointer: stop analysis", mem->_idx);
  }
}

// Trace: invariant() saw a node whose loop membership and basic-block
// membership disagree; dump both nodes for inspection.
void SWPointer::Tracer::invariant_1(Node *n, Node *n_c) const {
  if (_slp->do_vector_loop() && _slp->is_debug() && _slp->_lpt->is_member(_slp->_phase->get_loop(n_c)) != (int)_slp->in_bb(n)) {
    int is_member =  _slp->_lpt->is_member(_slp->_phase->get_loop(n_c));
    int in_bb     =  _slp->in_bb(n);
    print_depth(); tty->print("  \\ ");  tty->print_cr(" %d SWPointer::invariant  conditions differ: n_c %d", n->_idx, n_c->_idx);
    print_depth(); tty->print("  \\ ");  tty->print_cr("is_member %d, in_bb %d", is_member, in_bb);
    print_depth(); tty->print("  \\ ");  n->dump();
    print_depth(); tty->print("  \\ ");  n_c->dump();
  }
}
4104 
// Trace: scaled_iv_plus_offset() is testing node 'n'.
void SWPointer::Tracer::scaled_iv_plus_offset_1(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset testing node: ", n->_idx);
    n->dump();
  }
}

// Trace: 'n' matched directly as a scaled iv.
void SWPointer::Tracer::scaled_iv_plus_offset_2(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
  }
}

// Trace: 'n' matched directly as an offset term.
void SWPointer::Tracer::scaled_iv_plus_offset_3(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
  }
}

// Trace: AddI matched with in(1) = scaled iv, in(2) = offset.
void SWPointer::Tracer::scaled_iv_plus_offset_4(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
  }
}

// Trace: AddI matched with in(2) = scaled iv, in(1) = offset.
void SWPointer::Tracer::scaled_iv_plus_offset_5(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
  }
}

// Trace: SubI matched with in(1) = scaled iv, in(2) = offset.
void SWPointer::Tracer::scaled_iv_plus_offset_6(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
    print_depth(); tty->print("  \\  %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
  }
}

// Trace: SubI matched with in(2) = scaled iv, in(1) = offset.
void SWPointer::Tracer::scaled_iv_plus_offset_7(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
  }
}

// Trace: no scaled-iv-plus-offset pattern matched for 'n'.
void SWPointer::Tracer::scaled_iv_plus_offset_8(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: FAILED", n->_idx);
  }
}
4161 
// Trace: scaled_iv() is testing node 'n'.
void SWPointer::Tracer::scaled_iv_1(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::scaled_iv: testing node: ", n->_idx); n->dump();
  }
}

// Trace: failed because a scale was already recorded.
void SWPointer::Tracer::scaled_iv_2(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx);
    print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: _scale (%d) != 0", scale);
  }
}

// Trace: 'n' is the induction variable itself.
void SWPointer::Tracer::scaled_iv_3(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale);
  }
}

// Trace: MulI matched with in(1) = iv, in(2) = constant scale.
void SWPointer::Tracer::scaled_iv_4(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
  }
}

// Trace: MulI matched with in(2) = iv, in(1) = constant scale.
void SWPointer::Tracer::scaled_iv_5(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is iv: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
  }
}

// Trace: LShiftI matched (iv << constant).
void SWPointer::Tracer::scaled_iv_6(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale);
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
  }
}

// Trace: ConvI2L matched; its input is a scaled-iv-plus-offset subtree.
void SWPointer::Tracer::scaled_iv_7(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx);
    print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(1)->_idx);
    inc_depth(); inc_depth();
    print_depth(); n->in(1)->dump();
    dec_depth(); dec_depth();
  }
}

// Trace: LShiftL case — a temporary SWPointer was created for the subtree.
void SWPointer::Tracer::scaled_iv_8(Node* n, SWPointer* tmp) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::scaled_iv: Op_LShiftL, creating tmp SWPointer: ", n->_idx); tmp->print();
  }
}

// Trace: LShiftL matched; report resulting scale/offset and any invariant.
void SWPointer::Tracer::scaled_iv_9(Node* n, int scale, int offset, Node* invar, bool negate_invar) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, offset);
    print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: in(1) [%d] is scaled_iv_plus_offset, in(2) [%d] used to scale: _scale = %d, _offset = %d",
    n->in(1)->_idx, n->in(2)->_idx, scale, offset);
    if (invar != NULL) {
      print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: scaled invariant: %c[%d]", (negate_invar?'-':'+'), invar->_idx);
    }
    inc_depth(); inc_depth();
    print_depth(); n->in(1)->dump();
    print_depth(); n->in(2)->dump();
    if (invar != NULL) {
      print_depth(); invar->dump();
    }
    dec_depth(); dec_depth();
  }
}

// Trace: no scaled-iv pattern matched for 'n'.
void SWPointer::Tracer::scaled_iv_10(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED", n->_idx);
  }
}
4244 
// Trace: offset_plus_k() is testing node 'n'.
void SWPointer::Tracer::offset_plus_k_1(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::offset_plus_k: testing node: ", n->_idx); n->dump();
  }
}

// Trace: ConI matched and folded into _offset.
void SWPointer::Tracer::offset_plus_k_2(Node* n, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset);
  }
}

// Trace: ConL matched (value fit into an int) and folded into _offset.
void SWPointer::Tracer::offset_plus_k_3(Node* n, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset);
  }
}

// Trace: ConL rejected because the constant does not fit into an int.
void SWPointer::Tracer::offset_plus_k_4(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
    print_depth(); tty->print_cr("  \\ " JLONG_FORMAT " SWPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long());
  }
}

// Trace: failed because an invariant was already recorded.
void SWPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx);
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: _invar != NULL: ", _invar->_idx); _invar->dump();
  }
}

// Trace: AddI matched with in(1) = invariant, in(2) = constant.
void SWPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
    n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
  }
}

// Trace: AddI matched with in(2) = invariant, in(1) = constant.
void SWPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
    n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
  }
}

// Trace: SubI matched with in(1) = invariant, in(2) = constant.
void SWPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI is PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
    n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
  }
}

// Trace: SubI matched with in(2) = invariant (negated), in(1) = constant.
void SWPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
  }
}

// Trace: 'n' itself was accepted as the invariant.
void SWPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print_cr("  \\ %d SWPointer::offset_plus_k: is invariant", n->_idx);
  }
}

// Trace: no offset-plus-k pattern matched for 'n'.
void SWPointer::Tracer::offset_plus_k_11(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
  }
}
4324 
4325 #endif
// ========================= OrderedPair =====================

const OrderedPair OrderedPair::initial; // default-constructed initial value

// ========================= SWNodeInfo =====================

const SWNodeInfo SWNodeInfo::initial;   // default-constructed initial value
4333 
4334 
4335 // ============================ DepGraph ===========================
4336 
//------------------------------make_node---------------------------
// Make a new dependence graph node for an ideal node.
// Arena-allocated; non-NULL ideal nodes are registered in _map by their
// _idx so dep() lookups can find them later.
DepMem* DepGraph::make_node(Node* node) {
  DepMem* m = new (_arena) DepMem(node);
  if (node != NULL) {
    // Each ideal node may be registered at most once.
    assert(_map.at_grow(node->_idx) == NULL, "one init only");
    _map.at_put_grow(node->_idx, m);
  }
  return m;
}
4347 
//------------------------------make_edge---------------------------
// Make a new dependence graph edge from dpred -> dsucc
// The edge is pushed onto the head of dpred's out-list and dsucc's in-list.
DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) {
  DepEdge* e = new (_arena) DepEdge(dpred, dsucc, dsucc->in_head(), dpred->out_head());
  dpred->set_out_head(e);
  dsucc->set_in_head(e);
  return e;
}
4356 
4357 // ========================== DepMem ========================
4358 
4359 //------------------------------in_cnt---------------------------
in_cnt()4360 int DepMem::in_cnt() {
4361   int ct = 0;
4362   for (DepEdge* e = _in_head; e != NULL; e = e->next_in()) ct++;
4363   return ct;
4364 }
4365 
4366 //------------------------------out_cnt---------------------------
out_cnt()4367 int DepMem::out_cnt() {
4368   int ct = 0;
4369   for (DepEdge* e = _out_head; e != NULL; e = e->next_out()) ct++;
4370   return ct;
4371 }
4372 
//------------------------------print-----------------------------
// Debug-only dump.  Format: "  DepNode <idx> ( <pred idxs> ) [ <succ idxs> ]".
void DepMem::print() {
#ifndef PRODUCT
  tty->print("  DepNode %d (", _node->_idx);
  for (DepEdge* p = _in_head; p != NULL; p = p->next_in()) {
    Node* pred = p->pred()->node();
    tty->print(" %d", pred != NULL ? pred->_idx : 0);  // 0 stands in for a NULL node
  }
  tty->print(") [");
  for (DepEdge* s = _out_head; s != NULL; s = s->next_out()) {
    Node* succ = s->succ()->node();
    tty->print(" %d", succ != NULL ? succ->_idx : 0);
  }
  tty->print_cr(" ]");
#endif
}
4389 
4390 // =========================== DepEdge =========================
4391 
4392 //------------------------------DepPreds---------------------------
// Debug-only dump of one edge: predecessor and successor node indices.
void DepEdge::print() {
#ifndef PRODUCT
  tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx);
#endif
}
4398 
4399 // =========================== DepPreds =========================
4400 // Iterator over predecessor edges in the dependence graph.
4401 
4402 //------------------------------DepPreds---------------------------
DepPreds::DepPreds(Node* n, DepGraph& dg) {
  _n = n;
  _done = false;
  if (_n->is_Store() || _n->is_Load()) {
    // Loads/stores: iterate ideal inputs from the address edge onward,
    // plus the node's explicit dependence-graph in-edges.
    _next_idx = MemNode::Address;
    _end_idx  = n->req();
    _dep_next = dg.dep(_n)->in_head();
  } else if (_n->is_Mem()) {
    // Other memory nodes: only the dependence-graph in-edges.
    _next_idx = 0;
    _end_idx  = 0;
    _dep_next = dg.dep(_n)->in_head();
  } else {
    // Non-memory nodes: plain ideal inputs, skipping input 0.
    _next_idx = 1;
    _end_idx  = _n->req();
    _dep_next = NULL;
  }
  next();  // advance to the first predecessor (or set _done)
}
4421 
4422 //------------------------------next---------------------------
next()4423 void DepPreds::next() {
4424   if (_dep_next != NULL) {
4425     _current  = _dep_next->pred()->node();
4426     _dep_next = _dep_next->next_in();
4427   } else if (_next_idx < _end_idx) {
4428     _current  = _n->in(_next_idx++);
4429   } else {
4430     _done = true;
4431   }
4432 }
4433 
4434 // =========================== DepSuccs =========================
4435 // Iterator over successor edges in the dependence graph.
4436 
4437 //------------------------------DepSuccs---------------------------
DepSuccs::DepSuccs(Node* n, DepGraph& dg) {
  _n = n;
  _done = false;
  if (_n->is_Load()) {
    // Loads: iterate all raw outputs plus dependence-graph out-edges.
    _next_idx = 0;
    _end_idx  = _n->outcnt();
    _dep_next = dg.dep(_n)->out_head();
  } else if (_n->is_Mem() || (_n->is_Phi() && _n->bottom_type() == Type::MEMORY)) {
    // Other memory nodes and memory phis: only dependence-graph out-edges.
    _next_idx = 0;
    _end_idx  = 0;
    _dep_next = dg.dep(_n)->out_head();
  } else {
    // Non-memory nodes: plain raw outputs.
    _next_idx = 0;
    _end_idx  = _n->outcnt();
    _dep_next = NULL;
  }
  next();  // advance to the first successor (or set _done)
}
4456 
4457 //-------------------------------next---------------------------
next()4458 void DepSuccs::next() {
4459   if (_dep_next != NULL) {
4460     _current  = _dep_next->succ()->node();
4461     _dep_next = _dep_next->next_out();
4462   } else if (_next_idx < _end_idx) {
4463     _current  = _n->raw_out(_next_idx++);
4464   } else {
4465     _done = true;
4466   }
4467 }
4468 
4469 //
4470 // --------------------------------- vectorization/simd -----------------------------------
4471 //
same_origin_idx(Node * a,Node * b) const4472 bool SuperWord::same_origin_idx(Node* a, Node* b) const {
4473   return a != NULL && b != NULL && _clone_map.same_idx(a->_idx, b->_idx);
4474 }
same_generation(Node * a,Node * b) const4475 bool SuperWord::same_generation(Node* a, Node* b) const {
4476   return a != NULL && b != NULL && _clone_map.same_gen(a->_idx, b->_idx);
4477 }
4478 
find_phi_for_mem_dep(LoadNode * ld)4479 Node*  SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
4480   assert(in_bb(ld), "must be in block");
4481   if (_clone_map.gen(ld->_idx) == _ii_first) {
4482 #ifndef PRODUCT
4483     if (_vector_loop_debug) {
4484       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
4485         _clone_map.gen(ld->_idx));
4486     }
4487 #endif
4488     return NULL; //we think that any ld in the first gen being vectorizable
4489   }
4490 
4491   Node* mem = ld->in(MemNode::Memory);
4492   if (mem->outcnt() <= 1) {
4493     // we don't want to remove the only edge from mem node to load
4494 #ifndef PRODUCT
4495     if (_vector_loop_debug) {
4496       tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
4497         mem->_idx, ld->_idx);
4498       ld->dump();
4499       mem->dump();
4500     }
4501 #endif
4502     return NULL;
4503   }
4504   if (!in_bb(mem) || same_generation(mem, ld)) {
4505 #ifndef PRODUCT
4506     if (_vector_loop_debug) {
4507       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
4508         _clone_map.gen(mem->_idx));
4509     }
4510 #endif
4511     return NULL; // does not depend on loop volatile node or depends on the same generation
4512   }
4513 
4514   //otherwise first node should depend on mem-phi
4515   Node* first = first_node(ld);
4516   assert(first->is_Load(), "must be Load");
4517   Node* phi = first->as_Load()->in(MemNode::Memory);
4518   if (!phi->is_Phi() || phi->bottom_type() != Type::MEMORY) {
4519 #ifndef PRODUCT
4520     if (_vector_loop_debug) {
4521       tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi");
4522       ld->dump();
4523       first->dump();
4524     }
4525 #endif
4526     return NULL;
4527   }
4528 
4529   Node* tail = 0;
4530   for (int m = 0; m < _mem_slice_head.length(); m++) {
4531     if (_mem_slice_head.at(m) == phi) {
4532       tail = _mem_slice_tail.at(m);
4533     }
4534   }
4535   if (tail == 0) { //test that found phi is in the list  _mem_slice_head
4536 #ifndef PRODUCT
4537     if (_vector_loop_debug) {
4538       tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
4539         ld->_idx, phi->_idx);
4540       ld->dump();
4541       phi->dump();
4542     }
4543 #endif
4544     return NULL;
4545   }
4546 
4547   // now all conditions are met
4548   return phi;
4549 }
4550 
first_node(Node * nd)4551 Node* SuperWord::first_node(Node* nd) {
4552   for (int ii = 0; ii < _iteration_first.length(); ii++) {
4553     Node* nnn = _iteration_first.at(ii);
4554     if (same_origin_idx(nnn, nd)) {
4555 #ifndef PRODUCT
4556       if (_vector_loop_debug) {
4557         tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
4558           nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
4559       }
4560 #endif
4561       return nnn;
4562     }
4563   }
4564 
4565 #ifndef PRODUCT
4566   if (_vector_loop_debug) {
4567     tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
4568       nd->_idx, _clone_map.idx(nd->_idx));
4569   }
4570 #endif
4571   return 0;
4572 }
4573 
last_node(Node * nd)4574 Node* SuperWord::last_node(Node* nd) {
4575   for (int ii = 0; ii < _iteration_last.length(); ii++) {
4576     Node* nnn = _iteration_last.at(ii);
4577     if (same_origin_idx(nnn, nd)) {
4578 #ifndef PRODUCT
4579       if (_vector_loop_debug) {
4580         tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
4581           _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
4582       }
4583 #endif
4584       return nnn;
4585     }
4586   }
4587   return 0;
4588 }
4589 
mark_generations()4590 int SuperWord::mark_generations() {
4591   Node *ii_err = NULL, *tail_err = NULL;
4592   for (int i = 0; i < _mem_slice_head.length(); i++) {
4593     Node* phi  = _mem_slice_head.at(i);
4594     assert(phi->is_Phi(), "must be phi");
4595 
4596     Node* tail = _mem_slice_tail.at(i);
4597     if (_ii_last == -1) {
4598       tail_err = tail;
4599       _ii_last = _clone_map.gen(tail->_idx);
4600     }
4601     else if (_ii_last != _clone_map.gen(tail->_idx)) {
4602 #ifndef PRODUCT
4603       if (TraceSuperWord && Verbose) {
4604         tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes ");
4605         tail->dump();
4606         tail_err->dump();
4607       }
4608 #endif
4609       return -1;
4610     }
4611 
4612     // find first iteration in the loop
4613     for (DUIterator_Fast imax, i = phi->fast_outs(imax); i < imax; i++) {
4614       Node* ii = phi->fast_out(i);
4615       if (in_bb(ii) && ii->is_Store()) { // we speculate that normally Stores of one and one only generation have deps from mem phi
4616         if (_ii_first == -1) {
4617           ii_err = ii;
4618           _ii_first = _clone_map.gen(ii->_idx);
4619         } else if (_ii_first != _clone_map.gen(ii->_idx)) {
4620 #ifndef PRODUCT
4621           if (TraceSuperWord && Verbose) {
4622             tty->print_cr("SuperWord::mark_generations: _ii_first was found before and not equal to one in this node (%d)", _ii_first);
4623             ii->dump();
4624             if (ii_err!= 0) {
4625               ii_err->dump();
4626             }
4627           }
4628 #endif
4629           return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
4630         }
4631       }
4632     }//for (DUIterator_Fast imax,
4633   }//for (int i...
4634 
4635   if (_ii_first == -1 || _ii_last == -1) {
4636     if (TraceSuperWord && Verbose) {
4637       tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong");
4638     }
4639     return -1; // something vent wrong
4640   }
4641   // collect nodes in the first and last generations
4642   assert(_iteration_first.length() == 0, "_iteration_first must be empty");
4643   assert(_iteration_last.length() == 0, "_iteration_last must be empty");
4644   for (int j = 0; j < _block.length(); j++) {
4645     Node* n = _block.at(j);
4646     node_idx_t gen = _clone_map.gen(n->_idx);
4647     if ((signed)gen == _ii_first) {
4648       _iteration_first.push(n);
4649     } else if ((signed)gen == _ii_last) {
4650       _iteration_last.push(n);
4651     }
4652   }
4653 
4654   // building order of iterations
4655   if (_ii_order.length() == 0 && ii_err != 0) {
4656     assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
4657     Node* nd = ii_err;
4658     while(_clone_map.gen(nd->_idx) != _ii_last) {
4659       _ii_order.push(_clone_map.gen(nd->_idx));
4660       bool found = false;
4661       for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
4662         Node* use = nd->fast_out(i);
4663         if (same_origin_idx(use, nd) && use->as_Store()->in(MemNode::Memory) == nd) {
4664           found = true;
4665           nd = use;
4666           break;
4667         }
4668       }//for
4669 
4670       if (found == false) {
4671         if (TraceSuperWord && Verbose) {
4672           tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx);
4673         }
4674         _ii_order.clear();
4675         return -1;
4676       }
4677     } //while
4678     _ii_order.push(_clone_map.gen(nd->_idx));
4679   }
4680 
4681 #ifndef PRODUCT
4682   if (_vector_loop_debug) {
4683     tty->print_cr("SuperWord::mark_generations");
4684     tty->print_cr("First generation (%d) nodes:", _ii_first);
4685     for (int ii = 0; ii < _iteration_first.length(); ii++)  _iteration_first.at(ii)->dump();
4686     tty->print_cr("Last generation (%d) nodes:", _ii_last);
4687     for (int ii = 0; ii < _iteration_last.length(); ii++)  _iteration_last.at(ii)->dump();
4688     tty->print_cr(" ");
4689 
4690     tty->print("SuperWord::List of generations: ");
4691     for (int jj = 0; jj < _ii_order.length(); ++jj) {
4692       tty->print("%d:%d ", jj, _ii_order.at(jj));
4693     }
4694     tty->print_cr(" ");
4695   }
4696 #endif
4697 
4698   return _ii_first;
4699 }
4700 
fix_commutative_inputs(Node * gold,Node * fix)4701 bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
4702   assert(gold->is_Add() && fix->is_Add() || gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes");
4703   assert(same_origin_idx(gold, fix), "should be clones of the same node");
4704   Node* gin1 = gold->in(1);
4705   Node* gin2 = gold->in(2);
4706   Node* fin1 = fix->in(1);
4707   Node* fin2 = fix->in(2);
4708   bool swapped = false;
4709 
4710   if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin1)) {
4711     if (same_origin_idx(gin1, fin1) &&
4712         same_origin_idx(gin2, fin2)) {
4713       return true; // nothing to fix
4714     }
4715     if (same_origin_idx(gin1, fin2) &&
4716         same_origin_idx(gin2, fin1)) {
4717       fix->swap_edges(1, 2);
4718       swapped = true;
4719     }
4720   }
4721   // at least one input comes from outside of bb
4722   if (gin1->_idx == fin1->_idx)  {
4723     return true; // nothing to fix
4724   }
4725   if (!swapped && (gin1->_idx == fin2->_idx || gin2->_idx == fin1->_idx))  { //swapping is expensive, check condition first
4726     fix->swap_edges(1, 2);
4727     swapped = true;
4728   }
4729 
4730   if (swapped) {
4731 #ifndef PRODUCT
4732     if (_vector_loop_debug) {
4733       tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx);
4734     }
4735 #endif
4736     return true;
4737   }
4738 
4739   if (TraceSuperWord && Verbose) {
4740     tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx);
4741   }
4742 
4743   return false;
4744 }
4745 
//------------------------------pack_parallel---------------------------
// Build the packset directly from the clones produced by loop unrolling:
// for each Load/Store/Add/Mul node of the first unroll generation, gather
// its clones from the subsequent generations (walked in _ii_order order)
// into packs of four. Requires mark_generations() to have filled
// _iteration_first and _ii_order. Returns false if the generation order is
// unknown (_ii_order empty), true otherwise.
bool SuperWord::pack_parallel() {
#ifndef PRODUCT
  if (_vector_loop_debug) {
    tty->print_cr("SuperWord::pack_parallel: START");
  }
#endif

  // Discard any packset built by the regular SLP analysis; packs are
  // reconstructed here purely from clone-map generation information.
  _packset.clear();

  if (_ii_order.is_empty()) {
#ifndef PRODUCT
    if (_vector_loop_debug) {
      tty->print_cr("SuperWord::pack_parallel: EMPTY");
    }
#endif
    return false;
  }

  for (int ii = 0; ii < _iteration_first.length(); ii++) {
    Node* nd = _iteration_first.at(ii);
    // Only these node kinds are packed; nd seeds the pack as lane 0.
    if (in_bb(nd) && (nd->is_Load() || nd->is_Store() || nd->is_Add() || nd->is_Mul())) {
      Node_List* pk = new Node_List();
      pk->push(nd);
      // gen starts at 1: generation 0 of _ii_order is nd's own generation.
      for (int gen = 1; gen < _ii_order.length(); ++gen) {
        for (int kk = 0; kk < _block.length(); kk++) {
          Node* clone = _block.at(kk);
          // Match the clone of nd belonging to this particular generation.
          if (same_origin_idx(clone, nd) &&
              _clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
            if (nd->is_Add() || nd->is_Mul()) {
              // Unrolling may have swapped commutative inputs; realign them
              // with the first-generation node so lanes correspond.
              fix_commutative_inputs(nd, clone);
            }
            pk->push(clone);
            // Packs are fixed at 4 lanes; emit and start a new pack.
            if (pk->size() == 4) {
              _packset.append(pk);
#ifndef PRODUCT
              if (_vector_loop_debug) {
                tty->print_cr("SuperWord::pack_parallel: added pack ");
                pk->dump();
              }
#endif
              // Only allocate a fresh pack if more generations remain.
              if (_clone_map.gen(clone->_idx) != _ii_last) {
                pk = new Node_List();
              }
            }
            break; // clone for this generation found; next generation
          }
        }
      }//for
    }//if
  }//for

#ifndef PRODUCT
  if (_vector_loop_debug) {
    tty->print_cr("SuperWord::pack_parallel: END");
  }
#endif

  return true;
}
4805 
hoist_loads_in_graph()4806 bool SuperWord::hoist_loads_in_graph() {
4807   GrowableArray<Node*> loads;
4808 
4809 #ifndef PRODUCT
4810   if (_vector_loop_debug) {
4811     tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length());
4812   }
4813 #endif
4814 
4815   for (int i = 0; i < _mem_slice_head.length(); i++) {
4816     Node* n = _mem_slice_head.at(i);
4817     if ( !in_bb(n) || !n->is_Phi() || n->bottom_type() != Type::MEMORY) {
4818       if (TraceSuperWord && Verbose) {
4819         tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx);
4820       }
4821       continue;
4822     }
4823 
4824 #ifndef PRODUCT
4825     if (_vector_loop_debug) {
4826       tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d  = _mem_slice_head.at(%d);", n->_idx, i);
4827     }
4828 #endif
4829 
4830     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
4831       Node* ld = n->fast_out(i);
4832       if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
4833         for (int i = 0; i < _block.length(); i++) {
4834           Node* ld2 = _block.at(i);
4835           if (ld2->is_Load() && same_origin_idx(ld, ld2) &&
4836               !same_generation(ld, ld2)) { // <= do not collect the first generation ld
4837 #ifndef PRODUCT
4838             if (_vector_loop_debug) {
4839               tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
4840                 ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
4841             }
4842 #endif
4843             // could not do on-the-fly, since iterator is immutable
4844             loads.push(ld2);
4845           }
4846         }// for
4847       }//if
4848     }//for (DUIterator_Fast imax,
4849   }//for (int i = 0; i
4850 
4851   for (int i = 0; i < loads.length(); i++) {
4852     LoadNode* ld = loads.at(i)->as_Load();
4853     Node* phi = find_phi_for_mem_dep(ld);
4854     if (phi != NULL) {
4855 #ifndef PRODUCT
4856       if (_vector_loop_debug) {
4857         tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
4858           MemNode::Memory, ld->_idx, phi->_idx);
4859       }
4860 #endif
4861       _igvn.replace_input_of(ld, MemNode::Memory, phi);
4862     }
4863   }//for
4864 
4865   restart(); // invalidate all basic structures, since we rebuilt the graph
4866 
4867   if (TraceSuperWord && Verbose) {
4868     tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild");
4869   }
4870 
4871   return true;
4872 }
4873