/*
 * Copyright (c) 2007, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
23
#include "precompiled.hpp"
#include "compiler/compileLog.hpp"
#include "libadt/vectset.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/resourceArea.hpp"
#include "opto/addnode.hpp"
#include "opto/callnode.hpp"
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
#include "opto/divnode.hpp"
#include "opto/matcher.hpp"
#include "opto/memnode.hpp"
#include "opto/movenode.hpp"
#include "opto/mulnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/opaquenode.hpp"
#include "opto/superword.hpp"
#include "opto/vectornode.hpp"
#include "utilities/powerOfTwo.hpp"
43
//
//                  S U P E R W O R D   T R A N S F O R M
//=============================================================================
47
48 //------------------------------SuperWord---------------------------
SuperWord(PhaseIdealLoop * phase)49 SuperWord::SuperWord(PhaseIdealLoop* phase) :
50 _phase(phase),
51 _arena(phase->C->comp_arena()),
52 _igvn(phase->_igvn),
53 _packset(arena(), 8, 0, NULL), // packs for the current block
54 _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
55 _block(arena(), 8, 0, NULL), // nodes in current block
56 _post_block(arena(), 8, 0, NULL), // nodes common to current block which are marked as post loop vectorizable
57 _data_entry(arena(), 8, 0, NULL), // nodes with all inputs from outside
58 _mem_slice_head(arena(), 8, 0, NULL), // memory slice heads
59 _mem_slice_tail(arena(), 8, 0, NULL), // memory slice tails
60 _node_info(arena(), 8, 0, SWNodeInfo::initial), // info needed per node
61 _clone_map(phase->C->clone_map()), // map of nodes created in cloning
62 _cmovev_kit(_arena, this), // map to facilitate CMoveV creation
63 _align_to_ref(NULL), // memory reference to align vectors to
64 _disjoint_ptrs(arena(), 8, 0, OrderedPair::initial), // runtime disambiguated pointer pairs
65 _dg(_arena), // dependence graph
66 _visited(arena()), // visited node set
67 _post_visited(arena()), // post visited node set
68 _n_idx_list(arena(), 8), // scratch list of (node,index) pairs
69 _nlist(arena(), 8, 0, NULL), // scratch list of nodes
70 _stk(arena(), 8, 0, NULL), // scratch stack of nodes
71 _lpt(NULL), // loop tree node
72 _lp(NULL), // CountedLoopNode
73 _pre_loop_end(NULL), // Pre loop CountedLoopEndNode
74 _bb(NULL), // basic block
75 _iv(NULL), // induction var
76 _race_possible(false), // cases where SDMU is true
77 _early_return(true), // analysis evaluations routine
78 _do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
79 _do_reserve_copy(DoReserveCopyInSuperWord),
80 _num_work_vecs(0), // amount of vector work we have
81 _num_reductions(0), // amount of reduction work we have
82 _ii_first(-1), // first loop generation index - only if do_vector_loop()
83 _ii_last(-1), // last loop generation index - only if do_vector_loop()
84 _ii_order(arena(), 8, 0, 0)
85 {
86 #ifndef PRODUCT
87 _vector_loop_debug = 0;
88 if (_phase->C->method() != NULL) {
89 _vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
90 }
91
92 #endif
93 }
94
static const bool _do_vector_loop_experimental = false; // Experimental vectorization which uses data from loop unrolling.
96
97 //------------------------------transform_loop---------------------------
transform_loop(IdealLoopTree * lpt,bool do_optimization)98 void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
99 assert(UseSuperWord, "should be");
100 // SuperWord only works with power of two vector sizes.
101 int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
102 if (vector_width < 2 || !is_power_of_2(vector_width)) {
103 return;
104 }
105
106 assert(lpt->_head->is_CountedLoop(), "must be");
107 CountedLoopNode *cl = lpt->_head->as_CountedLoop();
108
109 if (!cl->is_valid_counted_loop(T_INT)) return; // skip malformed counted loop
110
111 bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
112 if (post_loop_allowed) {
113 if (cl->is_reduction_loop()) return; // no predication mapping
114 Node *limit = cl->limit();
115 if (limit->is_Con()) return; // non constant limits only
116 // Now check the limit for expressions we do not handle
117 if (limit->is_Add()) {
118 Node *in2 = limit->in(2);
119 if (in2->is_Con()) {
120 int val = in2->get_int();
121 // should not try to program these cases
122 if (val < 0) return;
123 }
124 }
125 }
126
127 // skip any loop that has not been assigned max unroll by analysis
128 if (do_optimization) {
129 if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) return;
130 }
131
132 // Check for no control flow in body (other than exit)
133 Node *cl_exit = cl->loopexit();
134 if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
135 #ifndef PRODUCT
136 if (TraceSuperWord) {
137 tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
138 tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
139 tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
140 tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
141 lpt->dump_head();
142 }
143 #endif
144 return;
145 }
146
147 // Make sure the are no extra control users of the loop backedge
148 if (cl->back_control()->outcnt() != 1) {
149 return;
150 }
151
152 // Skip any loops already optimized by slp
153 if (cl->is_vectorized_loop()) return;
154
155 if (cl->is_unroll_only()) return;
156
157 if (cl->is_main_loop()) {
158 // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
159 CountedLoopEndNode* pre_end = find_pre_loop_end(cl);
160 if (pre_end == NULL) {
161 return;
162 }
163 Node* pre_opaq1 = pre_end->limit();
164 if (pre_opaq1->Opcode() != Op_Opaque1) {
165 return;
166 }
167 set_pre_loop_end(pre_end);
168 }
169
170 init(); // initialize data structures
171
172 set_lpt(lpt);
173 set_lp(cl);
174
175 // For now, define one block which is the entire loop body
176 set_bb(cl);
177
178 if (do_optimization) {
179 assert(_packset.length() == 0, "packset must be empty");
180 SLP_extract();
181 if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
182 if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
183 IdealLoopTree *lpt_next = lpt->_next;
184 CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
185 _phase->has_range_checks(lpt_next);
186 if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
187 if (!cl_next->is_vectorized_loop()) {
188 int slp_max_unroll_factor = cl->slp_max_unroll();
189 cl_next->set_slp_max_unroll(slp_max_unroll_factor);
190 }
191 }
192 }
193 }
194 }
195 }
196
197 //------------------------------early unrolling analysis------------------------------
unrolling_analysis(int & local_loop_unroll_factor)198 void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
199 bool is_slp = true;
200 ResourceMark rm;
201 size_t ignored_size = lpt()->_body.size();
202 int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
203 Node_Stack nstack((int)ignored_size);
204 CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
205 Node *cl_exit = cl->loopexit_or_null();
206 int rpo_idx = _post_block.length();
207
208 assert(rpo_idx == 0, "post loop block is empty");
209
210 // First clear the entries
211 for (uint i = 0; i < lpt()->_body.size(); i++) {
212 ignored_loop_nodes[i] = -1;
213 }
214
215 int max_vector = Matcher::max_vector_size(T_BYTE);
216 bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
217
218 // Process the loop, some/all of the stack entries will not be in order, ergo
219 // need to preprocess the ignored initial state before we process the loop
220 for (uint i = 0; i < lpt()->_body.size(); i++) {
221 Node* n = lpt()->_body.at(i);
222 if (n == cl->incr() ||
223 n->is_reduction() ||
224 n->is_AddP() ||
225 n->is_Cmp() ||
226 n->is_IfTrue() ||
227 n->is_CountedLoop() ||
228 (n == cl_exit)) {
229 ignored_loop_nodes[i] = n->_idx;
230 continue;
231 }
232
233 if (n->is_If()) {
234 IfNode *iff = n->as_If();
235 if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
236 if (lpt()->is_loop_exit(iff)) {
237 ignored_loop_nodes[i] = n->_idx;
238 continue;
239 }
240 }
241 }
242
243 if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
244 Node* n_tail = n->in(LoopNode::LoopBackControl);
245 if (n_tail != n->in(LoopNode::EntryControl)) {
246 if (!n_tail->is_Mem()) {
247 is_slp = false;
248 break;
249 }
250 }
251 }
252
253 // This must happen after check of phi/if
254 if (n->is_Phi() || n->is_If()) {
255 ignored_loop_nodes[i] = n->_idx;
256 continue;
257 }
258
259 if (n->is_LoadStore() || n->is_MergeMem() ||
260 (n->is_Proj() && !n->as_Proj()->is_CFG())) {
261 is_slp = false;
262 break;
263 }
264
265 // Ignore nodes with non-primitive type.
266 BasicType bt;
267 if (n->is_Mem()) {
268 bt = n->as_Mem()->memory_type();
269 } else {
270 bt = n->bottom_type()->basic_type();
271 }
272 if (is_java_primitive(bt) == false) {
273 ignored_loop_nodes[i] = n->_idx;
274 continue;
275 }
276
277 if (n->is_Mem()) {
278 MemNode* current = n->as_Mem();
279 Node* adr = n->in(MemNode::Address);
280 Node* n_ctrl = _phase->get_ctrl(adr);
281
282 // save a queue of post process nodes
283 if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
284 // Process the memory expression
285 int stack_idx = 0;
286 bool have_side_effects = true;
287 if (adr->is_AddP() == false) {
288 nstack.push(adr, stack_idx++);
289 } else {
290 // Mark the components of the memory operation in nstack
291 SWPointer p1(current, this, &nstack, true);
292 have_side_effects = p1.node_stack()->is_nonempty();
293 }
294
295 // Process the pointer stack
296 while (have_side_effects) {
297 Node* pointer_node = nstack.node();
298 for (uint j = 0; j < lpt()->_body.size(); j++) {
299 Node* cur_node = lpt()->_body.at(j);
300 if (cur_node == pointer_node) {
301 ignored_loop_nodes[j] = cur_node->_idx;
302 break;
303 }
304 }
305 nstack.pop();
306 have_side_effects = nstack.is_nonempty();
307 }
308 }
309 }
310 }
311
312 if (is_slp) {
313 // Now we try to find the maximum supported consistent vector which the machine
314 // description can use
315 bool small_basic_type = false;
316 bool flag_small_bt = false;
317 for (uint i = 0; i < lpt()->_body.size(); i++) {
318 if (ignored_loop_nodes[i] != -1) continue;
319
320 BasicType bt;
321 Node* n = lpt()->_body.at(i);
322 if (n->is_Mem()) {
323 bt = n->as_Mem()->memory_type();
324 } else {
325 bt = n->bottom_type()->basic_type();
326 }
327
328 if (post_loop_allowed) {
329 if (!small_basic_type) {
330 switch (bt) {
331 case T_CHAR:
332 case T_BYTE:
333 case T_SHORT:
334 small_basic_type = true;
335 break;
336
337 case T_LONG:
338 // TODO: Remove when support completed for mask context with LONG.
339 // Support needs to be augmented for logical qword operations, currently we map to dword
340 // buckets for vectors on logicals as these were legacy.
341 small_basic_type = true;
342 break;
343
344 default:
345 break;
346 }
347 }
348 }
349
350 if (is_java_primitive(bt) == false) continue;
351
352 int cur_max_vector = Matcher::max_vector_size(bt);
353
354 // If a max vector exists which is not larger than _local_loop_unroll_factor
355 // stop looking, we already have the max vector to map to.
356 if (cur_max_vector < local_loop_unroll_factor) {
357 is_slp = false;
358 if (TraceSuperWordLoopUnrollAnalysis) {
359 tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
360 }
361 break;
362 }
363
364 // Map the maximal common vector
365 if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
366 if (cur_max_vector < max_vector && !flag_small_bt) {
367 max_vector = cur_max_vector;
368 } else if (cur_max_vector > max_vector && UseSubwordForMaxVector) {
369 // Analyse subword in the loop to set maximum vector size to take advantage of full vector width for subword types.
370 // Here we analyze if narrowing is likely to happen and if it is we set vector size more aggressively.
371 // We check for possibility of narrowing by looking through chain operations using subword types.
372 if (is_subword_type(bt)) {
373 uint start, end;
374 VectorNode::vector_operands(n, &start, &end);
375
376 for (uint j = start; j < end; j++) {
377 Node* in = n->in(j);
378 // Don't propagate through a memory
379 if (!in->is_Mem() && in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
380 bool same_type = true;
381 for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
382 Node *use = in->fast_out(k);
383 if (!in_bb(use) && use->bottom_type()->basic_type() != bt) {
384 same_type = false;
385 break;
386 }
387 }
388 if (same_type) {
389 max_vector = cur_max_vector;
390 flag_small_bt = true;
391 cl->mark_subword_loop();
392 }
393 }
394 }
395 }
396 }
397 // We only process post loops on predicated targets where we want to
398 // mask map the loop to a single iteration
399 if (post_loop_allowed) {
400 _post_block.at_put_grow(rpo_idx++, n);
401 }
402 }
403 }
404 if (is_slp) {
405 local_loop_unroll_factor = max_vector;
406 cl->mark_passed_slp();
407 }
408 cl->mark_was_slp();
409 if (cl->is_main_loop()) {
410 cl->set_slp_max_unroll(local_loop_unroll_factor);
411 } else if (post_loop_allowed) {
412 if (!small_basic_type) {
413 // avoid replication context for small basic types in programmable masked loops
414 cl->set_slp_max_unroll(local_loop_unroll_factor);
415 }
416 }
417 }
418 }
419
420 //------------------------------SLP_extract---------------------------
421 // Extract the superword level parallelism
422 //
423 // 1) A reverse post-order of nodes in the block is constructed. By scanning
424 // this list from first to last, all definitions are visited before their uses.
425 //
426 // 2) A point-to-point dependence graph is constructed between memory references.
427 // This simplies the upcoming "independence" checker.
428 //
429 // 3) The maximum depth in the node graph from the beginning of the block
430 // to each node is computed. This is used to prune the graph search
431 // in the independence checker.
432 //
433 // 4) For integer types, the necessary bit width is propagated backwards
434 // from stores to allow packed operations on byte, char, and short
435 // integers. This reverses the promotion to type "int" that javac
436 // did for operations like: char c1,c2,c3; c1 = c2 + c3.
437 //
438 // 5) One of the memory references is picked to be an aligned vector reference.
439 // The pre-loop trip count is adjusted to align this reference in the
440 // unrolled body.
441 //
442 // 6) The initial set of pack pairs is seeded with memory references.
443 //
444 // 7) The set of pack pairs is extended by following use->def and def->use links.
445 //
446 // 8) The pairs are combined into vector sized packs.
447 //
448 // 9) Reorder the memory slices to co-locate members of the memory packs.
449 //
450 // 10) Generate ideal vector nodes for the final set of packs and where necessary,
451 // inserting scalar promotion, vector creation from multiple scalars, and
452 // extraction of scalar values from vectors.
453 //
SLP_extract()454 void SuperWord::SLP_extract() {
455
456 #ifndef PRODUCT
457 if (_do_vector_loop && TraceSuperWord) {
458 tty->print("SuperWord::SLP_extract\n");
459 tty->print("input loop\n");
460 _lpt->dump_head();
461 _lpt->dump();
462 for (uint i = 0; i < _lpt->_body.size(); i++) {
463 _lpt->_body.at(i)->dump();
464 }
465 }
466 #endif
467 // Ready the block
468 if (!construct_bb()) {
469 return; // Exit if no interesting nodes or complex graph.
470 }
471
472 // build _dg, _disjoint_ptrs
473 dependence_graph();
474
475 // compute function depth(Node*)
476 compute_max_depth();
477
478 CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
479 bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
480 if (cl->is_main_loop()) {
481 if (_do_vector_loop_experimental) {
482 if (mark_generations() != -1) {
483 hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
484
485 if (!construct_bb()) {
486 return; // Exit if no interesting nodes or complex graph.
487 }
488 dependence_graph();
489 compute_max_depth();
490 }
491
492 #ifndef PRODUCT
493 if (TraceSuperWord) {
494 tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
495 _lpt->dump_head();
496 for (int j = 0; j < _block.length(); j++) {
497 Node* n = _block.at(j);
498 int d = depth(n);
499 for (int i = 0; i < d; i++) tty->print("%s", " ");
500 tty->print("%d :", d);
501 n->dump();
502 }
503 }
504 #endif
505 }
506
507 compute_vector_element_type();
508
509 // Attempt vectorization
510
511 find_adjacent_refs();
512
513 if (align_to_ref() == NULL) {
514 return; // Did not find memory reference to align vectors
515 }
516
517 extend_packlist();
518
519 if (_do_vector_loop_experimental) {
520 if (_packset.length() == 0) {
521 #ifndef PRODUCT
522 if (TraceSuperWord) {
523 tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
524 }
525 #endif
526 pack_parallel();
527 }
528 }
529
530 combine_packs();
531
532 construct_my_pack_map();
533 if (UseVectorCmov) {
534 merge_packs_to_cmovd();
535 }
536
537 filter_packs();
538
539 schedule();
540 } else if (post_loop_allowed) {
541 int saved_mapped_unroll_factor = cl->slp_max_unroll();
542 if (saved_mapped_unroll_factor) {
543 int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
544
545 // now reset the slp_unroll_factor so that we can check the analysis mapped
546 // what the vector loop was mapped to
547 cl->set_slp_max_unroll(0);
548
549 // do the analysis on the post loop
550 unrolling_analysis(vector_mapped_unroll_factor);
551
552 // if our analyzed loop is a canonical fit, start processing it
553 if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
554 // now add the vector nodes to packsets
555 for (int i = 0; i < _post_block.length(); i++) {
556 Node* n = _post_block.at(i);
557 Node_List* singleton = new Node_List();
558 singleton->push(n);
559 _packset.append(singleton);
560 set_my_pack(n, singleton);
561 }
562
563 // map base types for vector usage
564 compute_vector_element_type();
565 } else {
566 return;
567 }
568 } else {
569 // for some reason we could not map the slp analysis state of the vectorized loop
570 return;
571 }
572 }
573
574 output();
575 }
576
577 //------------------------------find_adjacent_refs---------------------------
578 // Find the adjacent memory references and create pack pairs for them.
579 // This is the initial set of packs that will then be extended by
580 // following use->def and def->use links. The align positions are
581 // assigned relative to the reference "align_to_ref"
find_adjacent_refs()582 void SuperWord::find_adjacent_refs() {
583 // Get list of memory operations
584 Node_List memops;
585 for (int i = 0; i < _block.length(); i++) {
586 Node* n = _block.at(i);
587 if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
588 is_java_primitive(n->as_Mem()->memory_type())) {
589 int align = memory_alignment(n->as_Mem(), 0);
590 if (align != bottom_align) {
591 memops.push(n);
592 }
593 }
594 }
595 if (TraceSuperWord) {
596 tty->print_cr("\nfind_adjacent_refs found %d memops", memops.size());
597 }
598
599 Node_List align_to_refs;
600 int max_idx;
601 int best_iv_adjustment = 0;
602 MemNode* best_align_to_mem_ref = NULL;
603
604 while (memops.size() != 0) {
605 // Find a memory reference to align to.
606 MemNode* mem_ref = find_align_to_ref(memops, max_idx);
607 if (mem_ref == NULL) break;
608 align_to_refs.push(mem_ref);
609 int iv_adjustment = get_iv_adjustment(mem_ref);
610
611 if (best_align_to_mem_ref == NULL) {
612 // Set memory reference which is the best from all memory operations
613 // to be used for alignment. The pre-loop trip count is modified to align
614 // this reference to a vector-aligned address.
615 best_align_to_mem_ref = mem_ref;
616 best_iv_adjustment = iv_adjustment;
617 NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
618 }
619
620 SWPointer align_to_ref_p(mem_ref, this, NULL, false);
621 // Set alignment relative to "align_to_ref" for all related memory operations.
622 for (int i = memops.size() - 1; i >= 0; i--) {
623 MemNode* s = memops.at(i)->as_Mem();
624 if (isomorphic(s, mem_ref) &&
625 (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
626 SWPointer p2(s, this, NULL, false);
627 if (p2.comparable(align_to_ref_p)) {
628 int align = memory_alignment(s, iv_adjustment);
629 set_alignment(s, align);
630 }
631 }
632 }
633
634 // Create initial pack pairs of memory operations for which
635 // alignment is set and vectors will be aligned.
636 bool create_pack = true;
637 if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
638 if (vectors_should_be_aligned()) {
639 int vw = vector_width(mem_ref);
640 int vw_best = vector_width(best_align_to_mem_ref);
641 if (vw > vw_best) {
642 // Do not vectorize a memory access with more elements per vector
643 // if unaligned memory access is not allowed because number of
644 // iterations in pre-loop will be not enough to align it.
645 create_pack = false;
646 } else {
647 SWPointer p2(best_align_to_mem_ref, this, NULL, false);
648 if (!align_to_ref_p.invar_equals(p2)) {
649 // Do not vectorize memory accesses with different invariants
650 // if unaligned memory accesses are not allowed.
651 create_pack = false;
652 }
653 }
654 }
655 } else {
656 if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
657 // Can't allow vectorization of unaligned memory accesses with the
658 // same type since it could be overlapped accesses to the same array.
659 create_pack = false;
660 } else {
661 // Allow independent (different type) unaligned memory operations
662 // if HW supports them.
663 if (vectors_should_be_aligned()) {
664 create_pack = false;
665 } else {
666 // Check if packs of the same memory type but
667 // with a different alignment were created before.
668 for (uint i = 0; i < align_to_refs.size(); i++) {
669 MemNode* mr = align_to_refs.at(i)->as_Mem();
670 if (mr == mem_ref) {
671 // Skip when we are looking at same memory operation.
672 continue;
673 }
674 if (same_velt_type(mr, mem_ref) &&
675 memory_alignment(mr, iv_adjustment) != 0)
676 create_pack = false;
677 }
678 }
679 }
680 }
681 if (create_pack) {
682 for (uint i = 0; i < memops.size(); i++) {
683 Node* s1 = memops.at(i);
684 int align = alignment(s1);
685 if (align == top_align) continue;
686 for (uint j = 0; j < memops.size(); j++) {
687 Node* s2 = memops.at(j);
688 if (alignment(s2) == top_align) continue;
689 if (s1 != s2 && are_adjacent_refs(s1, s2)) {
690 if (stmts_can_pack(s1, s2, align)) {
691 Node_List* pair = new Node_List();
692 pair->push(s1);
693 pair->push(s2);
694 if (!_do_vector_loop || same_origin_idx(s1, s2)) {
695 _packset.append(pair);
696 }
697 }
698 }
699 }
700 }
701 } else { // Don't create unaligned pack
702 // First, remove remaining memory ops of the same type from the list.
703 for (int i = memops.size() - 1; i >= 0; i--) {
704 MemNode* s = memops.at(i)->as_Mem();
705 if (same_velt_type(s, mem_ref)) {
706 memops.remove(i);
707 }
708 }
709
710 // Second, remove already constructed packs of the same type.
711 for (int i = _packset.length() - 1; i >= 0; i--) {
712 Node_List* p = _packset.at(i);
713 MemNode* s = p->at(0)->as_Mem();
714 if (same_velt_type(s, mem_ref)) {
715 remove_pack_at(i);
716 }
717 }
718
719 // If needed find the best memory reference for loop alignment again.
720 if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
721 // Put memory ops from remaining packs back on memops list for
722 // the best alignment search.
723 uint orig_msize = memops.size();
724 for (int i = 0; i < _packset.length(); i++) {
725 Node_List* p = _packset.at(i);
726 MemNode* s = p->at(0)->as_Mem();
727 assert(!same_velt_type(s, mem_ref), "sanity");
728 memops.push(s);
729 }
730 best_align_to_mem_ref = find_align_to_ref(memops, max_idx);
731 if (best_align_to_mem_ref == NULL) {
732 if (TraceSuperWord) {
733 tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL");
734 }
735 // best_align_to_mem_ref will be used for adjusting the pre-loop limit in
736 // SuperWord::align_initial_loop_index. Find one with the biggest vector size,
737 // smallest data size and smallest iv offset from memory ops from remaining packs.
738 if (_packset.length() > 0) {
739 if (orig_msize == 0) {
740 best_align_to_mem_ref = memops.at(max_idx)->as_Mem();
741 } else {
742 for (uint i = 0; i < orig_msize; i++) {
743 memops.remove(0);
744 }
745 best_align_to_mem_ref = find_align_to_ref(memops, max_idx);
746 assert(best_align_to_mem_ref == NULL, "sanity");
747 best_align_to_mem_ref = memops.at(max_idx)->as_Mem();
748 }
749 assert(best_align_to_mem_ref != NULL, "sanity");
750 }
751 break;
752 }
753 best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
754 NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
755 // Restore list.
756 while (memops.size() > orig_msize)
757 (void)memops.pop();
758 }
759 } // unaligned memory accesses
760
761 // Remove used mem nodes.
762 for (int i = memops.size() - 1; i >= 0; i--) {
763 MemNode* m = memops.at(i)->as_Mem();
764 if (alignment(m) != top_align) {
765 memops.remove(i);
766 }
767 }
768
769 } // while (memops.size() != 0
770 set_align_to_ref(best_align_to_mem_ref);
771
772 if (TraceSuperWord) {
773 tty->print_cr("\nAfter find_adjacent_refs");
774 print_packset();
775 }
776 }
777
778 #ifndef PRODUCT
find_adjacent_refs_trace_1(Node * best_align_to_mem_ref,int best_iv_adjustment)779 void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment) {
780 if (is_trace_adjacent()) {
781 tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d",
782 best_align_to_mem_ref->_idx, best_iv_adjustment);
783 best_align_to_mem_ref->dump();
784 }
785 }
786 #endif
787
788 //------------------------------find_align_to_ref---------------------------
789 // Find a memory reference to align the loop induction variable to.
790 // Looks first at stores then at loads, looking for a memory reference
791 // with the largest number of references similar to it.
find_align_to_ref(Node_List & memops,int & idx)792 MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
793 GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
794
795 // Count number of comparable memory ops
796 for (uint i = 0; i < memops.size(); i++) {
797 MemNode* s1 = memops.at(i)->as_Mem();
798 SWPointer p1(s1, this, NULL, false);
799 // Only discard unalignable memory references if vector memory references
800 // should be aligned on this platform.
801 if (vectors_should_be_aligned() && !ref_is_alignable(p1)) {
802 *cmp_ct.adr_at(i) = 0;
803 continue;
804 }
805 for (uint j = i+1; j < memops.size(); j++) {
806 MemNode* s2 = memops.at(j)->as_Mem();
807 if (isomorphic(s1, s2)) {
808 SWPointer p2(s2, this, NULL, false);
809 if (p1.comparable(p2)) {
810 (*cmp_ct.adr_at(i))++;
811 (*cmp_ct.adr_at(j))++;
812 }
813 }
814 }
815 }
816
817 // Find Store (or Load) with the greatest number of "comparable" references,
818 // biggest vector size, smallest data size and smallest iv offset.
819 int max_ct = 0;
820 int max_vw = 0;
821 int max_idx = -1;
822 int min_size = max_jint;
823 int min_iv_offset = max_jint;
824 for (uint j = 0; j < memops.size(); j++) {
825 MemNode* s = memops.at(j)->as_Mem();
826 if (s->is_Store()) {
827 int vw = vector_width_in_bytes(s);
828 assert(vw > 1, "sanity");
829 SWPointer p(s, this, NULL, false);
830 if ( cmp_ct.at(j) > max_ct ||
831 (cmp_ct.at(j) == max_ct &&
832 ( vw > max_vw ||
833 (vw == max_vw &&
834 ( data_size(s) < min_size ||
835 (data_size(s) == min_size &&
836 p.offset_in_bytes() < min_iv_offset)))))) {
837 max_ct = cmp_ct.at(j);
838 max_vw = vw;
839 max_idx = j;
840 min_size = data_size(s);
841 min_iv_offset = p.offset_in_bytes();
842 }
843 }
844 }
845 // If no stores, look at loads
846 if (max_ct == 0) {
847 for (uint j = 0; j < memops.size(); j++) {
848 MemNode* s = memops.at(j)->as_Mem();
849 if (s->is_Load()) {
850 int vw = vector_width_in_bytes(s);
851 assert(vw > 1, "sanity");
852 SWPointer p(s, this, NULL, false);
853 if ( cmp_ct.at(j) > max_ct ||
854 (cmp_ct.at(j) == max_ct &&
855 ( vw > max_vw ||
856 (vw == max_vw &&
857 ( data_size(s) < min_size ||
858 (data_size(s) == min_size &&
859 p.offset_in_bytes() < min_iv_offset)))))) {
860 max_ct = cmp_ct.at(j);
861 max_vw = vw;
862 max_idx = j;
863 min_size = data_size(s);
864 min_iv_offset = p.offset_in_bytes();
865 }
866 }
867 }
868 }
869
870 #ifdef ASSERT
871 if (TraceSuperWord && Verbose) {
872 tty->print_cr("\nVector memops after find_align_to_ref");
873 for (uint i = 0; i < memops.size(); i++) {
874 MemNode* s = memops.at(i)->as_Mem();
875 s->dump();
876 }
877 }
878 #endif
879
880 idx = max_idx;
881 if (max_ct > 0) {
882 #ifdef ASSERT
883 if (TraceSuperWord) {
884 tty->print("\nVector align to node: ");
885 memops.at(max_idx)->as_Mem()->dump();
886 }
887 #endif
888 return memops.at(max_idx)->as_Mem();
889 }
890 return NULL;
891 }
892
893 //------------------span_works_for_memory_size-----------------------------
span_works_for_memory_size(MemNode * mem,int span,int mem_size,int offset)894 static bool span_works_for_memory_size(MemNode* mem, int span, int mem_size, int offset) {
895 bool span_matches_memory = false;
896 if ((mem_size == type2aelembytes(T_BYTE) || mem_size == type2aelembytes(T_SHORT))
897 && ABS(span) == type2aelembytes(T_INT)) {
898 // There is a mismatch on span size compared to memory.
899 for (DUIterator_Fast jmax, j = mem->fast_outs(jmax); j < jmax; j++) {
900 Node* use = mem->fast_out(j);
901 if (!VectorNode::is_type_transition_to_int(use)) {
902 return false;
903 }
904 }
905 // If all uses transition to integer, it means that we can successfully align even on mismatch.
906 return true;
907 }
908 else {
909 span_matches_memory = ABS(span) == mem_size;
910 }
911 return span_matches_memory && (ABS(offset) % mem_size) == 0;
912 }
913
//------------------------------ref_is_alignable---------------------------
// Can the preloop align the reference to position zero in the vector?
bool SuperWord::ref_is_alignable(SWPointer& p) {
  if (!p.has_iv()) {
    return true;   // no induction variable
  }
  CountedLoopEndNode* pre_end = pre_loop_end();
  assert(pre_end->stride_is_con(), "pre loop stride is constant");
  int preloop_stride = pre_end->stride_con();

  // Number of bytes the address advances per pre-loop iteration.
  int span = preloop_stride * p.scale_in_bytes();
  int mem_size = p.memory_size();
  int offset   = p.offset_in_bytes();
  // Stride one accesses are alignable if offset is aligned to memory operation size.
  // Offset can be unaligned when UseUnalignedAccesses is used.
  if (span_works_for_memory_size(p.mem(), span, mem_size, offset)) {
    return true;
  }
  // If the initial offset from start of the object is computable,
  // check if the pre-loop can align the final offset accordingly.
  //
  // In other words: Can we find an i such that the offset
  // after i pre-loop iterations is aligned to vw?
  //   (init_offset + pre_loop) % vw == 0              (1)
  // where
  //   pre_loop = i * span
  // is the number of bytes added to the offset by i pre-loop iterations.
  //
  // For this to hold we need pre_loop to increase init_offset by
  //   pre_loop = vw - (init_offset % vw)
  //
  // This is only possible if pre_loop is divisible by span because each
  // pre-loop iteration increases the initial offset by 'span' bytes:
  //   (vw - (init_offset % vw)) % span == 0
  //
  int vw = vector_width_in_bytes(p.mem());
  assert(vw > 1, "sanity");
  Node* init_nd = pre_end->init_trip();
  // The analysis below requires a compile-time constant initial iv value and
  // no loop-invariant component in the address.
  if (init_nd->is_Con() && p.invar() == NULL) {
    int init = init_nd->bottom_type()->is_int()->get_con();
    int init_offset = init * p.scale_in_bytes() + offset;
    if (init_offset < 0) { // negative offset from object start?
      return false;        // may happen in dead loop
    }
    if (vw % span == 0) {
      // If vm is a multiple of span, we use formula (1).
      if (span > 0) {
        return (vw - (init_offset % vw)) % span == 0;
      } else {
        assert(span < 0, "nonzero stride * scale");
        // Negative span: the pre-loop decreases the offset, so align downwards.
        return (init_offset % vw) % -span == 0;
      }
    } else if (span % vw == 0) {
      // If span is a multiple of vw, we can simplify formula (1) to:
      //   (init_offset + i * span) % vw == 0
      //     =>
      //   (init_offset % vw) + ((i * span) % vw) == 0
      //     =>
      //   init_offset % vw == 0
      //
      // Because we add a multiple of vw to the initial offset, the final
      // offset is a multiple of vw if and only if init_offset is a multiple.
      //
      return (init_offset % vw) == 0;
    }
  }
  return false;
}
982 //---------------------------get_vw_bytes_special------------------------
get_vw_bytes_special(MemNode * s)983 int SuperWord::get_vw_bytes_special(MemNode* s) {
984 // Get the vector width in bytes.
985 int vw = vector_width_in_bytes(s);
986
987 // Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
988 BasicType btype = velt_basic_type(s);
989 if (type2aelembytes(btype) == 2) {
990 bool should_combine_adjacent = true;
991 for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
992 Node* user = s->fast_out(i);
993 if (!VectorNode::is_muladds2i(user)) {
994 should_combine_adjacent = false;
995 }
996 }
997 if (should_combine_adjacent) {
998 vw = MIN2(Matcher::max_vector_size(btype)*type2aelembytes(btype), vw * 2);
999 }
1000 }
1001
1002 return vw;
1003 }
1004
//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
  SWPointer align_to_ref_p(mem_ref, this, NULL, false);
  int offset = align_to_ref_p.offset_in_bytes();
  int scale  = align_to_ref_p.scale_in_bytes();
  int elt_size = align_to_ref_p.memory_size();
  int vw       = get_vw_bytes_special(mem_ref);
  assert(vw > 1, "sanity");
  int iv_adjustment;
  if (scale != 0) {
    // Direction in which the iv moves this address (up or down memory).
    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
    // At least one iteration is executed in pre-loop by default. As result
    // several iterations are needed to align memory operations in main-loop even
    // if offset is 0.
    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
    // iv_adjustment_in_bytes must be a multiple of elt_size if vector memory
    // references should be aligned on this platform.
    assert((ABS(iv_adjustment_in_bytes) % elt_size) == 0 || !vectors_should_be_aligned(),
           "(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size);
    // Convert the byte adjustment into a number of iv steps (elements).
    iv_adjustment = iv_adjustment_in_bytes/elt_size;
  } else {
    // This memory op is not dependent on iv (scale == 0)
    iv_adjustment = 0;
  }

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
      mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
    mem_ref->dump();
  }
#endif
  return iv_adjustment;
}
1040
//---------------------------dependence_graph---------------------------
// Construct dependency graph.
// Add dependence edges to load/store nodes for memory dependence
//    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
void SuperWord::dependence_graph() {
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  // First, assign a dependence node to each memory node
  for (int i = 0; i < _block.length(); i++ ) {
    Node *n = _block.at(i);
    if (n->is_Mem() || (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
      _dg.make_node(n);
    }
  }

  // For each memory slice, create the dependences
  for (int i = 0; i < _mem_slice_head.length(); i++) {
    Node* n      = _mem_slice_head.at(i);
    Node* n_tail = _mem_slice_tail.at(i);

    // Get slice in predecessor order (last is first)
    if (cl->is_main_loop()) {
      mem_slice_preds(n_tail, n, _nlist);
    }

#ifndef PRODUCT
    if(TraceSuperWord && Verbose) {
      tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
      for (int j = _nlist.length() - 1; j >= 0 ; j--) {
        _nlist.at(j)->dump();
      }
    }
#endif
    // Make the slice dependent on the root
    DepMem* slice = _dg.dep(n);
    _dg.make_edge(_dg.root(), slice);

    // Create a sink for the slice
    DepMem* slice_sink = _dg.make_node(NULL);
    _dg.make_edge(slice_sink, _dg.tail());

    // Now visit each pair of memory ops, creating the edges
    for (int j = _nlist.length() - 1; j >= 0 ; j--) {
      Node* s1 = _nlist.at(j);

      // If no dependency yet, use slice
      if (_dg.dep(s1)->in_cnt() == 0) {
        _dg.make_edge(slice, s1);
      }
      SWPointer p1(s1->as_Mem(), this, NULL, false);
      bool sink_dependent = true;
      // Compare s1 against every earlier memory op in the slice.
      for (int k = j - 1; k >= 0; k--) {
        Node* s2 = _nlist.at(k);
        // Two loads never conflict with each other.
        if (s1->is_Load() && s2->is_Load())
          continue;
        SWPointer p2(s2->as_Mem(), this, NULL, false);

        int cmp = p1.cmp(p2);
        if (SuperWordRTDepCheck &&
            p1.base() != p2.base() && p1.valid() && p2.valid()) {
          // Create a runtime check to disambiguate
          OrderedPair pp(p1.base(), p2.base());
          _disjoint_ptrs.append_if_missing(pp);
        } else if (!SWPointer::not_equal(cmp)) {
          // Possibly same address: conservatively add a dependence edge.
          _dg.make_edge(s1, s2);
          sink_dependent = false;
        }
      }
      // Ops with no outgoing dependence are attached to the slice sink.
      if (sink_dependent) {
        _dg.make_edge(s1, slice_sink);
      }
    }

    if (TraceSuperWord) {
      tty->print_cr("\nDependence graph for slice: %d", n->_idx);
      for (int q = 0; q < _nlist.length(); q++) {
        _dg.print(_nlist.at(q));
      }
      tty->cr();
    }

    // _nlist is reused across slices.
    _nlist.clear();
  }

  if (TraceSuperWord) {
    tty->print_cr("\ndisjoint_ptrs: %s", _disjoint_ptrs.length() > 0 ? "" : "NONE");
    for (int r = 0; r < _disjoint_ptrs.length(); r++) {
      _disjoint_ptrs.at(r).print();
      tty->cr();
    }
    tty->cr();
  }

}
1135
//---------------------------mem_slice_preds---------------------------
// Return a memory slice (node list) in predecessor order starting at "start"
void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
  assert(preds.length() == 0, "start empty");
  Node* n = start;
  Node* prev = NULL;
  // Walk the memory chain from 'start' toward 'stop' via MemNode::Memory
  // inputs, collecting in-block loads hanging off each node plus the
  // memory nodes themselves.
  while (true) {
    NOT_PRODUCT( if(is_trace_mem_slice()) tty->print_cr("SuperWord::mem_slice_preds: n %d", n->_idx);)
    assert(in_bb(n), "must be in block");
    for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
      Node* out = n->fast_out(i);
      if (out->is_Load()) {
        if (in_bb(out)) {
          preds.push(out);
          if (TraceSuperWord && Verbose) {
            tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", out->_idx);
          }
        }
      } else {
        // FIXME
        if (out->is_MergeMem() && !in_bb(out)) {
          // Either unrolling is causing a memory edge not to disappear,
          // or need to run igvn.optimize() again before SLP
        } else if (out->is_Phi() && out->bottom_type() == Type::MEMORY && !in_bb(out)) {
          // Ditto.  Not sure what else to check further.
        } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
          // StoreCM has an input edge used as a precedence edge.
          // Maybe an issue when oop stores are vectorized.
        } else {
          assert(out == prev || prev == NULL, "no branches off of store slice");
        }
      }//else
    }//for
    // 'stop' itself is not added to preds.
    if (n == stop) break;
    preds.push(n);
    if (TraceSuperWord && Verbose) {
      tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", n->_idx);
    }
    prev = n;
    assert(n->is_Mem(), "unexpected node %s", n->Name());
    n = n->in(MemNode::Memory);
  }
}
1179
1180 //------------------------------stmts_can_pack---------------------------
1181 // Can s1 and s2 be in a pack with s1 immediately preceding s2 and
1182 // s1 aligned at "align"
stmts_can_pack(Node * s1,Node * s2,int align)1183 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
1184
1185 // Do not use superword for non-primitives
1186 BasicType bt1 = velt_basic_type(s1);
1187 BasicType bt2 = velt_basic_type(s2);
1188 if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
1189 return false;
1190 if (Matcher::max_vector_size(bt1) < 2) {
1191 return false; // No vectors for this type
1192 }
1193
1194 if (isomorphic(s1, s2)) {
1195 if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) {
1196 if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
1197 if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
1198 int s1_align = alignment(s1);
1199 int s2_align = alignment(s2);
1200 if (s1_align == top_align || s1_align == align) {
1201 if (s2_align == top_align || s2_align == align + data_size(s1)) {
1202 return true;
1203 }
1204 }
1205 }
1206 }
1207 }
1208 }
1209 return false;
1210 }
1211
1212 //------------------------------exists_at---------------------------
1213 // Does s exist in a pack at position pos?
exists_at(Node * s,uint pos)1214 bool SuperWord::exists_at(Node* s, uint pos) {
1215 for (int i = 0; i < _packset.length(); i++) {
1216 Node_List* p = _packset.at(i);
1217 if (p->at(pos) == s) {
1218 return true;
1219 }
1220 }
1221 return false;
1222 }
1223
1224 //------------------------------are_adjacent_refs---------------------------
1225 // Is s1 immediately before s2 in memory?
are_adjacent_refs(Node * s1,Node * s2)1226 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
1227 if (!s1->is_Mem() || !s2->is_Mem()) return false;
1228 if (!in_bb(s1) || !in_bb(s2)) return false;
1229
1230 // Do not use superword for non-primitives
1231 if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
1232 !is_java_primitive(s2->as_Mem()->memory_type())) {
1233 return false;
1234 }
1235
1236 // FIXME - co_locate_pack fails on Stores in different mem-slices, so
1237 // only pack memops that are in the same alias set until that's fixed.
1238 if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
1239 _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
1240 return false;
1241 SWPointer p1(s1->as_Mem(), this, NULL, false);
1242 SWPointer p2(s2->as_Mem(), this, NULL, false);
1243 if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
1244 int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
1245 return diff == data_size(s1);
1246 }
1247
//------------------------------isomorphic---------------------------
// Are s1 and s2 similar?
bool SuperWord::isomorphic(Node* s1, Node* s2) {
  if (s1->Opcode() != s2->Opcode()) return false;
  if (s1->req() != s2->req()) return false;
  if (!same_velt_type(s1, s2)) return false;
  Node* s1_ctrl = s1->in(0);
  Node* s2_ctrl = s2->in(0);
  // If the control nodes are equivalent, no further checks are required to test for isomorphism.
  if (s1_ctrl == s2_ctrl) {
    return true;
  } else {
    bool s1_ctrl_inv = ((s1_ctrl == NULL) ? true : lpt()->is_invariant(s1_ctrl));
    bool s2_ctrl_inv = ((s2_ctrl == NULL) ? true : lpt()->is_invariant(s2_ctrl));
    // If the control nodes are not invariant for the loop, fail isomorphism test.
    if (!s1_ctrl_inv || !s2_ctrl_inv) {
      return false;
    }
    if(s1_ctrl != NULL && s2_ctrl != NULL) {
      // Look through projections to the controlling node itself.
      if (s1_ctrl->is_Proj()) {
        s1_ctrl = s1_ctrl->in(0);
        assert(lpt()->is_invariant(s1_ctrl), "must be invariant");
      }
      if (s2_ctrl->is_Proj()) {
        s2_ctrl = s2_ctrl->in(0);
        assert(lpt()->is_invariant(s2_ctrl), "must be invariant");
      }
      if (!s1_ctrl->is_RangeCheck() || !s2_ctrl->is_RangeCheck()) {
        return false;
      }
    }
    // Control nodes are invariant. However, we have no way of checking whether they resolve
    // in an equivalent manner. But, we know that invariant range checks are guaranteed to
    // throw before the loop (if they would have thrown). Thus, the loop would not have been reached.
    // Therefore, if the control nodes for both are range checks, we accept them to be isomorphic.
    // Additionally require that s1 and s2 both feed some MulAddS2I use.
    for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
      Node* t1 = s1->fast_out(i);
      for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
        Node* t2 = s2->fast_out(j);
        if (VectorNode::is_muladds2i(t1) && VectorNode::is_muladds2i(t2)) {
          return true;
        }
      }
    }
  }
  return false;
}
1295
1296 //------------------------------independent---------------------------
1297 // Is there no data path from s1 to s2 or s2 to s1?
independent(Node * s1,Node * s2)1298 bool SuperWord::independent(Node* s1, Node* s2) {
1299 // assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
1300 int d1 = depth(s1);
1301 int d2 = depth(s2);
1302 if (d1 == d2) return s1 != s2;
1303 Node* deep = d1 > d2 ? s1 : s2;
1304 Node* shallow = d1 > d2 ? s2 : s1;
1305
1306 visited_clear();
1307
1308 return independent_path(shallow, deep);
1309 }
1310
1311 //--------------------------have_similar_inputs-----------------------
1312 // For a node pair (s1, s2) which is isomorphic and independent,
1313 // do s1 and s2 have similar input edges?
have_similar_inputs(Node * s1,Node * s2)1314 bool SuperWord::have_similar_inputs(Node* s1, Node* s2) {
1315 // assert(isomorphic(s1, s2) == true, "check isomorphic");
1316 // assert(independent(s1, s2) == true, "check independent");
1317 if (s1->req() > 1 && !s1->is_Store() && !s1->is_Load()) {
1318 for (uint i = 1; i < s1->req(); i++) {
1319 if (s1->in(i)->Opcode() != s2->in(i)->Opcode()) return false;
1320 }
1321 }
1322 return true;
1323 }
1324
1325 //------------------------------reduction---------------------------
1326 // Is there a data path between s1 and s2 and the nodes reductions?
reduction(Node * s1,Node * s2)1327 bool SuperWord::reduction(Node* s1, Node* s2) {
1328 bool retValue = false;
1329 int d1 = depth(s1);
1330 int d2 = depth(s2);
1331 if (d2 > d1) {
1332 if (s1->is_reduction() && s2->is_reduction()) {
1333 // This is an ordered set, so s1 should define s2
1334 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1335 Node* t1 = s1->fast_out(i);
1336 if (t1 == s2) {
1337 // both nodes are reductions and connected
1338 retValue = true;
1339 }
1340 }
1341 }
1342 }
1343
1344 return retValue;
1345 }
1346
1347 //------------------------------independent_path------------------------------
1348 // Helper for independent
independent_path(Node * shallow,Node * deep,uint dp)1349 bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
1350 if (dp >= 1000) return false; // stop deep recursion
1351 visited_set(deep);
1352 int shal_depth = depth(shallow);
1353 assert(shal_depth <= depth(deep), "must be");
1354 for (DepPreds preds(deep, _dg); !preds.done(); preds.next()) {
1355 Node* pred = preds.current();
1356 if (in_bb(pred) && !visited_test(pred)) {
1357 if (shallow == pred) {
1358 return false;
1359 }
1360 if (shal_depth < depth(pred) && !independent_path(shallow, pred, dp+1)) {
1361 return false;
1362 }
1363 }
1364 }
1365 return true;
1366 }
1367
1368 //------------------------------set_alignment---------------------------
set_alignment(Node * s1,Node * s2,int align)1369 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
1370 set_alignment(s1, align);
1371 if (align == top_align || align == bottom_align) {
1372 set_alignment(s2, align);
1373 } else {
1374 set_alignment(s2, align + data_size(s1));
1375 }
1376 }
1377
1378 //------------------------------data_size---------------------------
data_size(Node * s)1379 int SuperWord::data_size(Node* s) {
1380 Node* use = NULL; //test if the node is a candidate for CMoveV optimization, then return the size of CMov
1381 if (UseVectorCmov) {
1382 use = _cmovev_kit.is_Bool_candidate(s);
1383 if (use != NULL) {
1384 return data_size(use);
1385 }
1386 use = _cmovev_kit.is_CmpD_candidate(s);
1387 if (use != NULL) {
1388 return data_size(use);
1389 }
1390 }
1391
1392 int bsize = type2aelembytes(velt_basic_type(s));
1393 assert(bsize != 0, "valid size");
1394 return bsize;
1395 }
1396
1397 //------------------------------extend_packlist---------------------------
1398 // Extend packset by following use->def and def->use links from pack members.
extend_packlist()1399 void SuperWord::extend_packlist() {
1400 bool changed;
1401 do {
1402 packset_sort(_packset.length());
1403 changed = false;
1404 for (int i = 0; i < _packset.length(); i++) {
1405 Node_List* p = _packset.at(i);
1406 changed |= follow_use_defs(p);
1407 changed |= follow_def_uses(p);
1408 }
1409 } while (changed);
1410
1411 if (_race_possible) {
1412 for (int i = 0; i < _packset.length(); i++) {
1413 Node_List* p = _packset.at(i);
1414 order_def_uses(p);
1415 }
1416 }
1417
1418 if (TraceSuperWord) {
1419 tty->print_cr("\nAfter extend_packlist");
1420 print_packset();
1421 }
1422 }
1423
//------------------------------follow_use_defs---------------------------
// Extend the packset by visiting operand definitions of nodes in pack p
bool SuperWord::follow_use_defs(Node_List* p) {
  assert(p->size() == 2, "just checking");
  Node* s1 = p->at(0);
  Node* s2 = p->at(1);
  assert(s1->req() == s2->req(), "just checking");
  assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

  if (s1->is_Load()) return false;  // nothing to do for loads

  int align = alignment(s1);
  NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, align);)
  bool changed = false;
  // For a store, only the stored value is a candidate operand;
  // otherwise consider all inputs.
  int start = s1->is_Store() ? MemNode::ValueIn : 1;
  int end   = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
  for (int j = start; j < end; j++) {
    Node* t1 = s1->in(j);
    Node* t2 = s2->in(j);
    if (!in_bb(t1) || !in_bb(t2))
      continue;
    if (stmts_can_pack(t1, t2, align)) {
      if (est_savings(t1, t2) >= 0) {
        // Pair the corresponding defs and propagate the pack's alignment.
        Node_List* pair = new Node_List();
        pair->push(t1);
        pair->push(t2);
        _packset.append(pair);
        NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: set_alignment(%d, %d, %d)", t1->_idx, t2->_idx, align);)
        set_alignment(t1, t2, align);
        changed = true;
      }
    }
  }
  return changed;
}
1459
//------------------------------follow_def_uses---------------------------
// Extend the packset by visiting uses of nodes in pack p
bool SuperWord::follow_def_uses(Node_List* p) {
  bool changed = false;
  Node* s1 = p->at(0);
  Node* s2 = p->at(1);
  assert(p->size() == 2, "just checking");
  assert(s1->req() == s2->req(), "just checking");
  assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

  if (s1->is_Store()) return false;  // stores have no value uses to follow

  int align = alignment(s1);
  NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: s1 %d, align %d", s1->_idx, align);)
  // Among all packable (use-of-s1, use-of-s2) pairs, keep only the single
  // pair with the highest estimated savings.
  int savings = -1;
  int num_s1_uses = 0;
  Node* u1 = NULL;
  Node* u2 = NULL;
  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* t1 = s1->fast_out(i);
    num_s1_uses++;
    if (!in_bb(t1)) continue;
    for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
      Node* t2 = s2->fast_out(j);
      if (!in_bb(t2)) continue;
      if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv
      if (!opnd_positions_match(s1, t1, s2, t2))
        continue;
      if (stmts_can_pack(t1, t2, align)) {
        int my_savings = est_savings(t1, t2);
        if (my_savings > savings) {
          savings = my_savings;
          u1 = t1;
          u2 = t2;
        }
      }
    }
  }
  // Multiple uses of s1 may mean later pack ordering matters.
  if (num_s1_uses > 1) {
    _race_possible = true;
  }
  if (savings >= 0) {
    // Pair the best uses and propagate the alignment.
    Node_List* pair = new Node_List();
    pair->push(u1);
    pair->push(u2);
    _packset.append(pair);
    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: set_alignment(%d, %d, %d)", u1->_idx, u2->_idx, align);)
    set_alignment(u1, u2, align);
    changed = true;
  }
  return changed;
}
1512
//------------------------------order_def_uses---------------------------
// For extended packsets, ordinally arrange uses packset by major component
void SuperWord::order_def_uses(Node_List* p) {
  Node* s1 = p->at(0);

  if (s1->is_Store()) return;

  // reductions are always managed beforehand
  if (s1->is_reduction()) return;

  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* t1 = s1->fast_out(i);

    // Only allow operand swap on commuting operations
    if (!t1->is_Add() && !t1->is_Mul() && !VectorNode::is_muladds2i(t1)) {
      break;
    }

    // Now find t1's packset
    Node_List* p2 = NULL;
    for (int j = 0; j < _packset.length(); j++) {
      p2 = _packset.at(j);
      Node* first = p2->at(0);
      if (t1 == first) {
        break;
      }
      p2 = NULL;  // reset so a miss leaves p2 NULL after the loop
    }
    // Arrange all sub components by the major component
    if (p2 != NULL) {
      for (uint j = 1; j < p->size(); j++) {
        Node* d1 = p->at(j);
        Node* u1 = p2->at(j);
        // Called for its side effect: opnd_positions_match() may swap u1's
        // input edges so operand positions line up with the lead pair.
        opnd_positions_match(s1, t1, d1, u1);
      }
    }
  }
}
1551
//---------------------------opnd_positions_match-------------------------
// Is the use of d1 in u1 at the same operand position as d2 in u2?
// NOTE: may swap input edges of u1/u2 as a side effect so that commuting
// operations end up with matching operand positions.
bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
  // check reductions to see if they are marshalled to represent the reduction
  // operator in a specified opnd
  if (u1->is_reduction() && u2->is_reduction()) {
    // ensure reductions have phis and reduction definitions feeding the 1st operand
    Node* first = u1->in(2);
    if (first->is_Phi() || first->is_reduction()) {
      u1->swap_edges(1, 2);
    }
    // ensure reductions have phis and reduction definitions feeding the 1st operand
    first = u2->in(2);
    if (first->is_Phi() || first->is_reduction()) {
      u2->swap_edges(1, 2);
    }
    return true;
  }

  uint ct = u1->req();
  if (ct != u2->req()) return false;
  uint i1 = 0;
  uint i2 = 0;
  // Scan successive occurrences of d1 in u1's inputs and d2 in u2's inputs
  // in lock-step; every occurrence must land at the same index.
  do {
    for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
    for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
    if (i1 != i2) {
      if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
        // Further analysis relies on operands position matching.
        u2->swap_edges(i1, i2);
      } else if (VectorNode::is_muladds2i(u2) && u1 != u2) {
        if (i1 == 5 - i2) { // ((i1 == 3 && i2 == 2) || (i1 == 2 && i2 == 3) || (i1 == 1 && i2 == 4) || (i1 == 4 && i2 == 1))
          u2->swap_edges(1, 2);
          u2->swap_edges(3, 4);
        }
        if (i1 == 3 - i2 || i1 == 7 - i2) { // ((i1 == 1 && i2 == 2) || (i1 == 2 && i2 == 1) || (i1 == 3 && i2 == 4) || (i1 == 4 && i2 == 3))
          u2->swap_edges(2, 3);
          u2->swap_edges(1, 4);
        }
        return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs
      } else {
        return false;
      }
    } else if (i1 == i2 && VectorNode::is_muladds2i(u2) && u1 != u2) {
      // Same position but distinct MulAddS2I uses: rotate u2's operands.
      u2->swap_edges(1, 3);
      u2->swap_edges(2, 4);
      return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs
    }
  } while (i1 < ct);
  return true;
}
1603
//------------------------------est_savings---------------------------
// Estimate the savings from executing s1 and s2 as a pack
int SuperWord::est_savings(Node* s1, Node* s2) {
  int save_in = 2 - 1; // 2 operations per instruction in packed form

  // inputs: credit adjacent memory defs, charge for defs that would need
  // a new pack, credit defs that are already packed together
  for (uint i = 1; i < s1->req(); i++) {
    Node* x1 = s1->in(i);
    Node* x2 = s2->in(i);
    if (x1 != x2) {
      if (are_adjacent_refs(x1, x2)) {
        save_in += adjacent_profit(x1, x2);
      } else if (!in_packset(x1, x2)) {
        save_in -= pack_cost(2);
      } else {
        save_in += unpack_cost(2);
      }
    }
  }

  // uses of result
  uint ct = 0;        // count of (s1_use, s2_use) pairs already covered by a pack
  int save_use = 0;
  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* s1_use = s1->fast_out(i);
    for (int j = 0; j < _packset.length(); j++) {
      Node_List* p = _packset.at(j);
      // s1's use leads a pack and s2's use ends the same pack.
      if (p->at(0) == s1_use) {
        for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
          Node* s2_use = s2->fast_out(k);
          if (p->at(p->size()-1) == s2_use) {
            ct++;
            if (are_adjacent_refs(s1_use, s2_use)) {
              save_use += adjacent_profit(s1_use, s2_use);
            }
          }
        }
      }
    }
  }

  // Account for uses not covered by any pack.
  if (ct < s1->outcnt()) save_use += unpack_cost(1);
  if (ct < s2->outcnt()) save_use += unpack_cost(1);

  return MAX2(save_in, save_use);
}
1650
//------------------------------costs---------------------------
// Simple cost/benefit constants used by est_savings().
int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; } // benefit of adjacent memory refs
int SuperWord::pack_cost(int ct)   { return ct; } // cost to create a pack of ct nodes
int SuperWord::unpack_cost(int ct) { return ct; } // cost to extract ct scalar values
1655
1656 //------------------------------combine_packs---------------------------
1657 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
combine_packs()1658 void SuperWord::combine_packs() {
1659 bool changed = true;
1660 // Combine packs regardless max vector size.
1661 while (changed) {
1662 changed = false;
1663 for (int i = 0; i < _packset.length(); i++) {
1664 Node_List* p1 = _packset.at(i);
1665 if (p1 == NULL) continue;
1666 // Because of sorting we can start at i + 1
1667 for (int j = i + 1; j < _packset.length(); j++) {
1668 Node_List* p2 = _packset.at(j);
1669 if (p2 == NULL) continue;
1670 if (i == j) continue;
1671 if (p1->at(p1->size()-1) == p2->at(0)) {
1672 for (uint k = 1; k < p2->size(); k++) {
1673 p1->push(p2->at(k));
1674 }
1675 _packset.at_put(j, NULL);
1676 changed = true;
1677 }
1678 }
1679 }
1680 }
1681
1682 // Split packs which have size greater then max vector size.
1683 for (int i = 0; i < _packset.length(); i++) {
1684 Node_List* p1 = _packset.at(i);
1685 if (p1 != NULL) {
1686 BasicType bt = velt_basic_type(p1->at(0));
1687 uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
1688 assert(is_power_of_2(max_vlen), "sanity");
1689 uint psize = p1->size();
1690 if (!is_power_of_2(psize)) {
1691 // Skip pack which can't be vector.
1692 // case1: for(...) { a[i] = i; } elements values are different (i+x)
1693 // case2: for(...) { a[i] = b[i+1]; } can't align both, load and store
1694 _packset.at_put(i, NULL);
1695 continue;
1696 }
1697 if (psize > max_vlen) {
1698 Node_List* pack = new Node_List();
1699 for (uint j = 0; j < psize; j++) {
1700 pack->push(p1->at(j));
1701 if (pack->size() >= max_vlen) {
1702 assert(is_power_of_2(pack->size()), "sanity");
1703 _packset.append(pack);
1704 pack = new Node_List();
1705 }
1706 }
1707 _packset.at_put(i, NULL);
1708 }
1709 }
1710 }
1711
1712 // Compress list.
1713 for (int i = _packset.length() - 1; i >= 0; i--) {
1714 Node_List* p1 = _packset.at(i);
1715 if (p1 == NULL) {
1716 _packset.remove_at(i);
1717 }
1718 }
1719
1720 if (TraceSuperWord) {
1721 tty->print_cr("\nAfter combine_packs");
1722 print_packset();
1723 }
1724 }
1725
1726 //-----------------------------construct_my_pack_map--------------------------
1727 // Construct the map from nodes to packs. Only valid after the
1728 // point where a node is only in one pack (after combine_packs).
construct_my_pack_map()1729 void SuperWord::construct_my_pack_map() {
1730 Node_List* rslt = NULL;
1731 for (int i = 0; i < _packset.length(); i++) {
1732 Node_List* p = _packset.at(i);
1733 for (uint j = 0; j < p->size(); j++) {
1734 Node* s = p->at(j);
1735 #ifdef ASSERT
1736 if (my_pack(s) != NULL) {
1737 s->dump(1);
1738 tty->print_cr("packs[%d]:", i);
1739 print_pack(p);
1740 assert(false, "only in one pack");
1741 }
1742 #endif
1743 set_my_pack(s, p);
1744 }
1745 }
1746 }
1747
//------------------------------filter_packs---------------------------
// Remove packs that are not implemented or not profitable.
void SuperWord::filter_packs() {
  // Remove packs that are not implemented
  for (int i = _packset.length() - 1; i >= 0; i--) {
    Node_List* pk = _packset.at(i);
    bool impl = implemented(pk);
    if (!impl) {
#ifndef PRODUCT
      if ((TraceSuperWord && Verbose) || _vector_loop_debug) {
        tty->print_cr("Unimplemented");
        pk->at(0)->dump();
      }
#endif
      remove_pack_at(i);
    }
    // NOTE(review): pk is still counted below even when it was just removed
    // above (the Node_List itself stays valid) — presumably intentional for
    // these heuristic counters, but worth confirming.
    Node *n = pk->at(0);
    if (n->is_reduction()) {
      _num_reductions++;
    } else {
      _num_work_vecs++;
    }
  }

  // Remove packs that are not profitable
  bool changed;
  do {
    changed = false;
    // Removing one pack can make another unprofitable, so iterate to a fixed point.
    for (int i = _packset.length() - 1; i >= 0; i--) {
      Node_List* pk = _packset.at(i);
      bool prof = profitable(pk);
      if (!prof) {
#ifndef PRODUCT
        if ((TraceSuperWord && Verbose) || _vector_loop_debug) {
          tty->print_cr("Unprofitable");
          pk->at(0)->dump();
        }
#endif
        remove_pack_at(i);
        changed = true;
      }
    }
  } while (changed);

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print_cr("\nAfter filter_packs");
    print_packset();
    tty->cr();
  }
#endif
}
1800
1801 //------------------------------merge_packs_to_cmovd---------------------------
1802 // Merge CMoveD into new vector-nodes
1803 // We want to catch this pattern and subsume CmpD and Bool into CMoveD
1804 //
1805 // SubD ConD
1806 // / | /
1807 // / | / /
1808 // / | / /
1809 // / | / /
1810 // / / /
1811 // / / | /
1812 // v / | /
1813 // CmpD | /
1814 // | | /
1815 // v | /
1816 // Bool | /
1817 // \ | /
1818 // \ | /
1819 // \ | /
1820 // \ | /
1821 // \ v /
1822 // CMoveD
1823 //
1824
merge_packs_to_cmovd()1825 void SuperWord::merge_packs_to_cmovd() {
1826 for (int i = _packset.length() - 1; i >= 0; i--) {
1827 _cmovev_kit.make_cmovevd_pack(_packset.at(i));
1828 }
1829 #ifndef PRODUCT
1830 if (TraceSuperWord) {
1831 tty->print_cr("\nSuperWord::merge_packs_to_cmovd(): After merge");
1832 print_packset();
1833 tty->cr();
1834 }
1835 #endif
1836 }
1837
is_Bool_candidate(Node * def) const1838 Node* CMoveKit::is_Bool_candidate(Node* def) const {
1839 Node* use = NULL;
1840 if (!def->is_Bool() || def->in(0) != NULL || def->outcnt() != 1) {
1841 return NULL;
1842 }
1843 for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1844 use = def->fast_out(j);
1845 if (!_sw->same_generation(def, use) || !use->is_CMove()) {
1846 return NULL;
1847 }
1848 }
1849 return use;
1850 }
1851
is_CmpD_candidate(Node * def) const1852 Node* CMoveKit::is_CmpD_candidate(Node* def) const {
1853 Node* use = NULL;
1854 if (!def->is_Cmp() || def->in(0) != NULL || def->outcnt() != 1) {
1855 return NULL;
1856 }
1857 for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1858 use = def->fast_out(j);
1859 if (!_sw->same_generation(def, use) || (use = is_Bool_candidate(use)) == NULL || !_sw->same_generation(def, use)) {
1860 return NULL;
1861 }
1862 }
1863 return use;
1864 }
1865
// Try to build one combined pack from a CMoveF/CMoveD pack and the packs of
// its Bool condition and Cmp input. All three packs must exist, have the same
// size, be control-free, single-use, and of the same unrolling generation.
// On success the three packs are removed from _packset and replaced by one
// merged pack (containing the CMove nodes); the CMove/Bool/Cmp nodes are all
// mapped to the new pack in the CMoveKit map. Returns the new pack, or NULL
// if any structural requirement is not met.
Node_List* CMoveKit::make_cmovevd_pack(Node_List* cmovd_pk) {
  Node *cmovd = cmovd_pk->at(0);
  if (!cmovd->is_CMove()) {
    return NULL;
  }
  // Only FP cmovs are merged; other CMove flavors are left alone.
  if (cmovd->Opcode() != Op_CMoveF && cmovd->Opcode() != Op_CMoveD) {
    return NULL;
  }
  if (pack(cmovd) != NULL) { // already in the cmov pack
    return NULL;
  }
  if (cmovd->in(0) != NULL) {
    NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CMoveD %d has control flow, escaping...", cmovd->_idx); cmovd->dump();})
    return NULL;
  }

  // The Bool condition must be a packed, control-free, single-use node of the
  // same generation as the CMove.
  Node* bol = cmovd->as_CMove()->in(CMoveNode::Condition);
  if (!bol->is_Bool()
      || bol->outcnt() != 1
      || !_sw->same_generation(bol, cmovd)
      || bol->in(0) != NULL // BoolNode has control flow!!
      || _sw->my_pack(bol) == NULL) {
      NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: Bool %d does not fit CMoveD %d for building vector, escaping...", bol->_idx, cmovd->_idx); bol->dump();})
      return NULL;
  }
  Node_List* bool_pk = _sw->my_pack(bol);
  if (bool_pk->size() != cmovd_pk->size() ) {
    return NULL;
  }

  // Same requirements for the Cmp that feeds the Bool.
  Node* cmpd = bol->in(1);
  if (!cmpd->is_Cmp()
      || cmpd->outcnt() != 1
      || !_sw->same_generation(cmpd, cmovd)
      || cmpd->in(0) != NULL // CmpDNode has control flow!!
      || _sw->my_pack(cmpd) == NULL) {
      NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CmpD %d does not fit CMoveD %d for building vector, escaping...", cmpd->_idx, cmovd->_idx); cmpd->dump();})
      return NULL;
  }
  Node_List* cmpd_pk = _sw->my_pack(cmpd);
  if (cmpd_pk->size() != cmovd_pk->size() ) {
    return NULL;
  }

  // Verify element-wise that the Cmp pack lines up with the CMove pack's
  // IfFalse/IfTrue inputs (in either orientation).
  if (!test_cmpd_pack(cmpd_pk, cmovd_pk)) {
    NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: cmpd pack for CmpD %d failed vectorization test", cmpd->_idx); cmpd->dump();})
    return NULL;
  }

  // Build the merged pack: it holds the CMove nodes, but all three nodes of
  // each lane (cmov, bol, cmp) are mapped to it so is_cmov_pack() and
  // is_cmov_pack_internal_node() can recognize the whole pattern.
  Node_List* new_cmpd_pk = new Node_List();
  uint sz = cmovd_pk->size() - 1;
  for (uint i = 0; i <= sz; ++i) {
    Node* cmov = cmovd_pk->at(i);
    Node* bol = bool_pk->at(i);
    Node* cmp = cmpd_pk->at(i);

    new_cmpd_pk->insert(i, cmov);

    map(cmov, new_cmpd_pk);
    map(bol, new_cmpd_pk);
    map(cmp, new_cmpd_pk);

    _sw->set_my_pack(cmov, new_cmpd_pk); // and keep old packs for cmp and bool
  }
  // Replace the three original packs with the single merged pack.
  _sw->_packset.remove(cmovd_pk);
  _sw->_packset.remove(bool_pk);
  _sw->_packset.remove(cmpd_pk);
  _sw->_packset.append(new_cmpd_pk);
  NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print_cr("CMoveKit::make_cmovevd_pack: added syntactic CMoveD pack"); _sw->print_pack(new_cmpd_pk);})
  return new_cmpd_pk;
}
1937
// Check whether the Cmp pack can be subsumed into the vector built for the
// CMove pack. Requires (a) each Cmp input to be either a pack of the same
// size or one node shared by all lanes, and (b) each lane's Cmp inputs
// (in1, in2) to match the corresponding CMove's (IfFalse, IfTrue) inputs
// in a single consistent orientation across the whole pack.
bool CMoveKit::test_cmpd_pack(Node_List* cmpd_pk, Node_List* cmovd_pk) {
  Node* cmpd0 = cmpd_pk->at(0);
  assert(cmpd0->is_Cmp(), "CMoveKit::test_cmpd_pack: should be CmpDNode");
  assert(cmovd_pk->at(0)->is_CMove(), "CMoveKit::test_cmpd_pack: should be CMoveD");
  assert(cmpd_pk->size() == cmovd_pk->size(), "CMoveKit::test_cmpd_pack: should be same size");
  Node* in1 = cmpd0->in(1);
  Node* in2 = cmpd0->in(2);
  Node_List* in1_pk = _sw->my_pack(in1);
  Node_List* in2_pk = _sw->my_pack(in2);

  // An input that is packed must be packed with the same lane count.
  if (  (in1_pk != NULL && in1_pk->size() != cmpd_pk->size())
     || (in2_pk != NULL && in2_pk->size() != cmpd_pk->size()) ) {
    return false;
  }

  // test if "all" in1 are in the same pack or the same node
  if (in1_pk == NULL) {
    for (uint j = 1; j < cmpd_pk->size(); j++) {
      if (cmpd_pk->at(j)->in(1) != in1) {
        return false;
      }
    }//for: in1_pk is not pack but all CmpD nodes in the pack have the same in(1)
  }
  // test if "all" in2 are in the same pack or the same node
  if (in2_pk == NULL) {
    for (uint j = 1; j < cmpd_pk->size(); j++) {
      if (cmpd_pk->at(j)->in(2) != in2) {
        return false;
      }
    }//for: in2_pk is not pack but all CmpD nodes in the pack have the same in(2)
  }
  //now check if cmpd_pk may be subsumed in vector built for cmovd_pk
  // Determine the orientation from lane 0: the Cmp inputs may map to the
  // CMove's (IfFalse, IfTrue) inputs either directly or swapped.
  int cmovd_ind1, cmovd_ind2;
  if (cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
   && cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
      cmovd_ind1 = CMoveNode::IfFalse;
      cmovd_ind2 = CMoveNode::IfTrue;
  } else if (cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
          && cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
      cmovd_ind2 = CMoveNode::IfFalse;
      cmovd_ind1 = CMoveNode::IfTrue;
  }
  else {
    return false;
  }

  // All remaining lanes must match under the orientation chosen for lane 0.
  for (uint j = 1; j < cmpd_pk->size(); j++) {
    if (cmpd_pk->at(j)->in(1) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind1)
        || cmpd_pk->at(j)->in(2) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind2)) {
        return false;
    }//if
  }
  NOT_PRODUCT(if(_sw->is_trace_cmov()) { tty->print("CMoveKit::test_cmpd_pack: cmpd pack for 1st CmpD %d is OK for vectorization: ", cmpd0->_idx); cmpd0->dump(); })
  return true;
}
1993
1994 //------------------------------implemented---------------------------
1995 // Can code be generated for pack p?
implemented(Node_List * p)1996 bool SuperWord::implemented(Node_List* p) {
1997 bool retValue = false;
1998 Node* p0 = p->at(0);
1999 if (p0 != NULL) {
2000 int opc = p0->Opcode();
2001 uint size = p->size();
2002 if (p0->is_reduction()) {
2003 const Type *arith_type = p0->bottom_type();
2004 // Length 2 reductions of INT/LONG do not offer performance benefits
2005 if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
2006 retValue = false;
2007 } else {
2008 retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
2009 }
2010 } else {
2011 retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
2012 }
2013 if (!retValue) {
2014 if (is_cmov_pack(p)) {
2015 NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmpd pack"); print_pack(p);})
2016 return true;
2017 }
2018 }
2019 }
2020 return retValue;
2021 }
2022
is_cmov_pack(Node_List * p)2023 bool SuperWord::is_cmov_pack(Node_List* p) {
2024 return _cmovev_kit.pack(p->at(0)) != NULL;
2025 }
2026 //------------------------------same_inputs--------------------------
2027 // For pack p, are all idx operands the same?
same_inputs(Node_List * p,int idx)2028 bool SuperWord::same_inputs(Node_List* p, int idx) {
2029 Node* p0 = p->at(0);
2030 uint vlen = p->size();
2031 Node* p0_def = p0->in(idx);
2032 for (uint i = 1; i < vlen; i++) {
2033 Node* pi = p->at(i);
2034 Node* pi_def = pi->in(idx);
2035 if (p0_def != pi_def) {
2036 return false;
2037 }
2038 }
2039 return true;
2040 }
2041
2042 //------------------------------profitable---------------------------
2043 // For pack p, are all operands and all uses (with in the block) vector?
// For pack p, are all operands and all uses (within the block) vector?
// Returns false if any required input is not itself packed compatibly, if a
// reduction is disconnected or not worth expanding, if a vector shift has a
// non-scalar or non-uniform count, or if any in-block scalar use of a pack
// member would require an (unimplemented) extract.
// NOTE: may clear Node::Flag_is_reduction on p's first node as a side effect.
bool SuperWord::profitable(Node_List* p) {
  Node* p0 = p->at(0);
  uint start, end;
  // [start, end) is the range of input edges that must be vectors for this opcode.
  VectorNode::vector_operands(p0, &start, &end);

  // Return false if some inputs are not vectors or vectors with different
  // size or alignment.
  // Also, for now, return false if not scalar promotion case when inputs are
  // the same. Later, implement PackNode and allow differing, non-vector inputs
  // (maybe just the ones from outside the block.)
  for (uint i = start; i < end; i++) {
    if (!is_vector_use(p0, i)) {
      return false;
    }
  }
  // Check if reductions are connected
  if (p0->is_reduction()) {
    Node* second_in = p0->in(2);
    Node_List* second_pk = my_pack(second_in);
    if ((second_pk == NULL) || (_num_work_vecs == _num_reductions)) {
      // Remove reduction flag if no parent pack or if not enough work
      // to cover reduction expansion overhead
      p0->remove_flag(Node::Flag_is_reduction);
      return false;
    } else if (second_pk->size() != p->size()) {
      return false;
    }
  }
  if (VectorNode::is_shift(p0)) {
    // For now, return false if shift count is vector or not scalar promotion
    // case (different shift counts) because it is not supported yet.
    Node* cnt = p0->in(2);
    Node_List* cnt_pk = my_pack(cnt);
    if (cnt_pk != NULL)
      return false;
    if (!same_inputs(p, 2))
      return false;
  }
  if (!p0->is_Store()) {
    // For now, return false if not all uses are vector.
    // Later, implement ExtractNode and allow non-vector uses (maybe
    // just the ones outside the block.)
    for (uint i = 0; i < p->size(); i++) {
      Node* def = p->at(i);
      // Bool/Cmp members of a merged cmov pack are consumed inside the
      // pack itself; their uses need not be vector.
      if (is_cmov_pack_internal_node(p, def)) {
        continue;
      }
      for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
        Node* use = def->fast_out(j);
        // Scan every input slot of 'use' since 'def' may appear more than once.
        for (uint k = 0; k < use->req(); k++) {
          Node* n = use->in(k);
          if (def == n) {
            // Reductions should only have a Phi use at the loop head or a non-phi use
            // outside of the loop if it is the last element of the pack (e.g. SafePoint).
            if (def->is_reduction() &&
                ((use->is_Phi() && use->in(0) == _lpt->_head) ||
                 (!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))) && i == p->size()-1))) {
              continue;
            }
            if (!is_vector_use(use, k)) {
              return false;
            }
          }
        }
      }
    }
  }
  return true;
}
2113
2114 //------------------------------schedule---------------------------
2115 // Adjust the memory graph for the packed operations
schedule()2116 void SuperWord::schedule() {
2117
2118 // Co-locate in the memory graph the members of each memory pack
2119 for (int i = 0; i < _packset.length(); i++) {
2120 co_locate_pack(_packset.at(i));
2121 }
2122 }
2123
2124 //-------------------------------remove_and_insert-------------------
2125 // Remove "current" from its current position in the memory graph and insert
2126 // it after the appropriate insertion point (lip or uip).
// Remove "current" from its current position in the memory graph and insert
// it after the appropriate insertion point: uip (upper insert point) if it
// must be scheduled before the pack, lip (lower insert point) otherwise.
// "prev" is the pack member that currently follows "current" in the memory
// chain; "sched_before" is the set of ops that must move above the pack.
void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
                                  Node *uip, Unique_Node_List &sched_before) {
  Node* my_mem = current->in(MemNode::Memory);
  bool sched_up = sched_before.member(current);

  // remove current_store from its current position in the memory graph
  for (DUIterator i = current->outs(); current->has_out(i); i++) {
    Node* use = current->out(i);
    if (use->is_Mem()) {
      assert(use->in(MemNode::Memory) == current, "must be");
      if (use == prev) { // connect prev to my_mem
        _igvn.replace_input_of(use, MemNode::Memory, my_mem);
        --i; //deleted this edge; rescan position
      } else if (sched_before.member(use)) {
        // 'use' moves above the pack; if 'current' stays below, detach
        // 'use' from it and hang it off the upper insert point.
        if (!sched_up) { // Will be moved together with current
          _igvn.replace_input_of(use, MemNode::Memory, uip);
          --i; //deleted this edge; rescan position
        }
      } else {
        // 'use' stays below the pack; if 'current' moves up, detach 'use'
        // from it and hang it off the lower insert point.
        if (sched_up) { // Will be moved together with current
          _igvn.replace_input_of(use, MemNode::Memory, lip);
          --i; //deleted this edge; rescan position
        }
      }
    }
  }

  Node *insert_pt = sched_up ?  uip : lip;

  // all uses of insert_pt's memory state should use current's instead
  for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
    Node* use = insert_pt->out(i);
    if (use->is_Mem()) {
      assert(use->in(MemNode::Memory) == insert_pt, "must be");
      _igvn.replace_input_of(use, MemNode::Memory, current);
      --i; //deleted this edge; rescan position
    } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
      uint pos; //lip (lower insert point) must be the last one in the memory slice
      for (pos=1; pos < use->req(); pos++) {
        if (use->in(pos) == insert_pt) break;
      }
      _igvn.replace_input_of(use, pos, current);
      --i;
    }
  }

  //connect current to insert_pt
  _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
}
2176
2177 //------------------------------co_locate_pack----------------------------------
2178 // To schedule a store pack, we need to move any sandwiched memory ops either before
2179 // or after the pack, based upon dependence information:
2180 // (1) If any store in the pack depends on the sandwiched memory op, the
2181 // sandwiched memory op must be scheduled BEFORE the pack;
2182 // (2) If a sandwiched memory op depends on any store in the pack, the
2183 // sandwiched memory op must be scheduled AFTER the pack;
2184 // (3) If a sandwiched memory op (say, memA) depends on another sandwiched
2185 // memory op (say memB), memB must be scheduled before memA. So, if memA is
2186 // scheduled before the pack, memB must also be scheduled before the pack;
2187 // (4) If there is no dependence restriction for a sandwiched memory op, we simply
2188 // schedule this store AFTER the pack
2189 // (5) We know there is no dependence cycle, so there in no other case;
2190 // (6) Finally, all memory ops in another single pack should be moved in the same direction.
2191 //
2192 // To schedule a load pack, we use the memory state of either the first or the last load in
2193 // the pack, based on the dependence constraint.
// Co-locate the members of a memory pack in the memory graph. For a store
// pack, sandwiched memory ops are moved above or below the pack according to
// the dependence rules documented above; for a load pack, all loads are given
// one common memory state chosen by pick_mem_state().
void SuperWord::co_locate_pack(Node_List* pk) {
  if (pk->at(0)->is_Store()) {
    MemNode* first = executed_first(pk)->as_Mem();
    MemNode* last = executed_last(pk)->as_Mem();
    Unique_Node_List schedule_before_pack;
    Unique_Node_List memops;

    // Collect, from 'last' back to 'first', all memory ops on the chain plus
    // every memory user hanging off it (the "sandwiched" ops).
    MemNode* current = last->in(MemNode::Memory)->as_Mem();
    MemNode* previous = last;
    while (true) {
      assert(in_bb(current), "stay in block");
      memops.push(previous);
      for (DUIterator i = current->outs(); current->has_out(i); i++) {
        Node* use = current->out(i);
        if (use->is_Mem() && use != previous)
          memops.push(use);
      }
      if (current == first) break;
      previous = current;
      current = current->in(MemNode::Memory)->as_Mem();
    }

    // determine which memory operations should be scheduled before the pack
    // (rule 1/3 above: anything an earlier pack member or already-hoisted op
    // depends on, plus all partners of its own pack — rule 6).
    for (uint i = 1; i < memops.size(); i++) {
      Node *s1 = memops.at(i);
      if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
        for (uint j = 0; j< i; j++) {
          Node *s2 = memops.at(j);
          if (!independent(s1, s2)) {
            if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
              schedule_before_pack.push(s1); // s1 must be scheduled before
              Node_List* mem_pk = my_pack(s1);
              if (mem_pk != NULL) {
                for (uint ii = 0; ii < mem_pk->size(); ii++) {
                  Node* s = mem_pk->at(ii); // follow partner
                  if (memops.member(s) && !schedule_before_pack.member(s))
                    schedule_before_pack.push(s);
                }
              }
              break;
            }
          }
        }
      }
    }

    Node* upper_insert_pt = first->in(MemNode::Memory);
    // Following code moves loads connected to upper_insert_pt below aliased stores.
    // Collect such loads here and reconnect them back to upper_insert_pt later.
    memops.clear();
    for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
      Node* use = upper_insert_pt->out(i);
      if (use->is_Mem() && !use->is_Store()) {
        memops.push(use);
      }
    }

    MemNode* lower_insert_pt = last;
    previous = last; //previous store in pk
    current = last->in(MemNode::Memory)->as_Mem();

    // start scheduling from "last" to "first"
    while (true) {
      assert(in_bb(current), "stay in block");
      assert(in_pack(previous, pk), "previous stays in pack");
      Node* my_mem = current->in(MemNode::Memory);

      if (in_pack(current, pk)) {
        // Forward users of my memory state (except "previous") to my input memory state
        for (DUIterator i = current->outs(); current->has_out(i); i++) {
          Node* use = current->out(i);
          if (use->is_Mem() && use != previous) {
            assert(use->in(MemNode::Memory) == current, "must be");
            if (schedule_before_pack.member(use)) {
              _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
            } else {
              _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
            }
            --i; // deleted this edge; rescan position
          }
        }
        previous = current;
      } else { // !in_pack(current, pk) ==> a sandwiched store
        remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
      }

      if (current == first) break;
      current = my_mem->as_Mem();
    } // end while

    // Reconnect loads back to upper_insert_pt.
    for (uint i = 0; i < memops.size(); i++) {
      Node *ld = memops.at(i);
      if (ld->in(MemNode::Memory) != upper_insert_pt) {
        _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
      }
    }
  } else if (pk->at(0)->is_Load()) { // Load pack
    // All loads in the pack should have the same memory state. By default,
    // we use the memory state of the last load. However, if any load could
    // not be moved down due to the dependence constraint, we use the memory
    // state of the first load.
    Node* mem_input = pick_mem_state(pk);
    _igvn.hash_delete(mem_input);
    // Give each load the same memory state
    for (uint i = 0; i < pk->size(); i++) {
      LoadNode* ld = pk->at(i)->as_Load();
      _igvn.replace_input_of(ld, MemNode::Memory, mem_input);
    }
  }
}
2305
2306 // Finds the first and last memory state and then picks either of them by checking dependence constraints.
2307 // If a store is dependent on an earlier load then we need to pick the memory state of the first load and cannot
2308 // pick the memory state of the last load.
Node* SuperWord::pick_mem_state(Node_List* pk) {
  Node* first_mem = find_first_mem_state(pk);
  Node* last_mem = find_last_mem_state(pk, first_mem);

  // For each load, walk the memory chain from last_mem down to that load's
  // own memory input; any dependent store on that segment means the load
  // cannot safely use the last memory state.
  for (uint i = 0; i < pk->size(); i++) {
    Node* ld = pk->at(i);
    for (Node* current = last_mem; current != ld->in(MemNode::Memory); current = current->in(MemNode::Memory)) {
      assert(current->is_Mem() && in_bb(current), "unexpected memory");
      assert(current != first_mem, "corrupted memory graph");
      if (!independent(current, ld)) {
#ifdef ASSERT
        // Added assertion code since no case has been observed that should pick the first memory state.
        // Remove the assertion code whenever we find a (valid) case that really needs the first memory state.
        pk->dump();
        first_mem->dump();
        last_mem->dump();
        current->dump();
        ld->dump();
        ld->in(MemNode::Memory)->dump();
        assert(false, "never observed that first memory should be picked");
#endif
        return first_mem; // A later store depends on this load, pick memory state of first load
      }
    }
  }
  // No conflicts: every load can be moved down to the last memory state.
  return last_mem;
}
2336
2337 // Walk the memory graph from the current first load until the
2338 // start of the loop and check if nodes on the way are memory
2339 // edges of loads in the pack. The last one we encounter is the
2340 // first load.
Node* SuperWord::find_first_mem_state(Node_List* pk) {
  Node* first_mem = pk->at(0)->in(MemNode::Memory);
  // Walk up the memory chain while still inside the block; a memory Phi at
  // the loop head is stepped over via its entry-control input.
  for (Node* current = first_mem; in_bb(current); current = current->is_Phi() ? current->in(LoopNode::EntryControl) : current->in(MemNode::Memory)) {
    assert(current->is_Mem() || (current->is_Phi() && current->in(0) == bb()), "unexpected memory");
    // The last match found on the walk upward is the earliest memory state
    // used by any load in the pack.
    for (uint i = 1; i < pk->size(); i++) {
      Node* ld = pk->at(i);
      if (ld->in(MemNode::Memory) == current) {
        first_mem = current;
        break;
      }
    }
  }
  return first_mem;
}
2355
2356 // Find the last load by going over the pack again and walking
2357 // the memory graph from the loads of the pack to the memory of
2358 // the first load. If we encounter the memory of the current last
2359 // load, then we started from further down in the memory graph and
2360 // the load we started from is the last load.
Node* SuperWord::find_last_mem_state(Node_List* pk, Node* first_mem) {
  Node* last_mem = pk->at(0)->in(MemNode::Memory);
  for (uint i = 0; i < pk->size(); i++) {
    Node* ld = pk->at(i);
    // Walk from this load's memory input up to first_mem; if the current
    // last_mem lies strictly on that path, this load's memory state is later.
    for (Node* current = ld->in(MemNode::Memory); current != first_mem; current = current->in(MemNode::Memory)) {
      assert(current->is_Mem() && in_bb(current), "unexpected memory");
      if (current->in(MemNode::Memory) == last_mem) {
        last_mem = ld->in(MemNode::Memory);
      }
    }
  }
  return last_mem;
}
2374
2375 #ifndef PRODUCT
print_loop(bool whole)2376 void SuperWord::print_loop(bool whole) {
2377 Node_Stack stack(_arena, _phase->C->unique() >> 2);
2378 Node_List rpo_list;
2379 VectorSet visited(_arena);
2380 visited.set(lpt()->_head->_idx);
2381 _phase->rpo(lpt()->_head, stack, visited, rpo_list);
2382 _phase->dump(lpt(), rpo_list.size(), rpo_list );
2383 if(whole) {
2384 tty->print_cr("\n Whole loop tree");
2385 _phase->dump();
2386 tty->print_cr(" End of whole loop tree\n");
2387 }
2388 }
2389 #endif
2390
2391 //------------------------------output---------------------------
2392 // Convert packs into vector node operations
output()2393 void SuperWord::output() {
2394 CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2395 Compile* C = _phase->C;
2396 if (_packset.length() == 0) {
2397 if (cl->is_main_loop()) {
2398 // Instigate more unrolling for optimization when vectorization fails.
2399 C->set_major_progress();
2400 cl->set_notpassed_slp();
2401 cl->mark_do_unroll_only();
2402 }
2403 return;
2404 }
2405
2406 #ifndef PRODUCT
2407 if (TraceLoopOpts) {
2408 tty->print("SuperWord::output ");
2409 lpt()->dump_head();
2410 }
2411 #endif
2412
2413 if (cl->is_main_loop()) {
2414 // MUST ENSURE main loop's initial value is properly aligned:
2415 // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
2416
2417 align_initial_loop_index(align_to_ref());
2418
2419 // Insert extract (unpack) operations for scalar uses
2420 for (int i = 0; i < _packset.length(); i++) {
2421 insert_extracts(_packset.at(i));
2422 }
2423 }
2424
2425 uint max_vlen_in_bytes = 0;
2426 uint max_vlen = 0;
2427 bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
2428
2429 NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
2430
2431 CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
2432
2433 NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop after create_reserve_version_of_loop"); print_loop(true);})
2434
2435 if (do_reserve_copy() && !make_reversable.has_reserved()) {
2436 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: loop was not reserved correctly, exiting SuperWord");})
2437 return;
2438 }
2439
2440 for (int i = 0; i < _block.length(); i++) {
2441 Node* n = _block.at(i);
2442 Node_List* p = my_pack(n);
2443 if (p && n == executed_last(p)) {
2444 uint vlen = p->size();
2445 uint vlen_in_bytes = 0;
2446 Node* vn = NULL;
2447 Node* low_adr = p->at(0);
2448 Node* first = executed_first(p);
2449 if (can_process_post_loop) {
2450 // override vlen with the main loops vector length
2451 vlen = cl->slp_max_unroll();
2452 }
2453 NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
2454 int opc = n->Opcode();
2455 if (n->is_Load()) {
2456 Node* ctl = n->in(MemNode::Control);
2457 Node* mem = first->in(MemNode::Memory);
2458 SWPointer p1(n->as_Mem(), this, NULL, false);
2459 // Identify the memory dependency for the new loadVector node by
2460 // walking up through memory chain.
2461 // This is done to give flexibility to the new loadVector node so that
2462 // it can move above independent storeVector nodes.
2463 while (mem->is_StoreVector()) {
2464 SWPointer p2(mem->as_Mem(), this, NULL, false);
2465 int cmp = p1.cmp(p2);
2466 if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
2467 mem = mem->in(MemNode::Memory);
2468 } else {
2469 break; // dependent memory
2470 }
2471 }
2472 Node* adr = low_adr->in(MemNode::Address);
2473 const TypePtr* atyp = n->adr_type();
2474 vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
2475 vlen_in_bytes = vn->as_LoadVector()->memory_size();
2476 } else if (n->is_Store()) {
2477 // Promote value to be stored to vector
2478 Node* val = vector_opd(p, MemNode::ValueIn);
2479 if (val == NULL) {
2480 if (do_reserve_copy()) {
2481 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: val should not be NULL, exiting SuperWord");})
2482 return; //and reverse to backup IG
2483 }
2484 ShouldNotReachHere();
2485 }
2486
2487 Node* ctl = n->in(MemNode::Control);
2488 Node* mem = first->in(MemNode::Memory);
2489 Node* adr = low_adr->in(MemNode::Address);
2490 const TypePtr* atyp = n->adr_type();
2491 vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
2492 vlen_in_bytes = vn->as_StoreVector()->memory_size();
2493 } else if (VectorNode::is_scalar_rotate(n)) {
2494 Node* in1 = low_adr->in(1);
2495 Node* in2 = p->at(0)->in(2);
2496 assert(in2->bottom_type()->isa_int(), "Shift must always be an int value");
2497 // If rotation count is non-constant or greater than 8bit value create a vector.
2498 if (!in2->is_Con() || -0x80 > in2->get_int() || in2->get_int() >= 0x80) {
2499 in2 = vector_opd(p, 2);
2500 }
2501 vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2502 vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2503 } else if (VectorNode::is_roundopD(n)) {
2504 Node* in1 = vector_opd(p, 1);
2505 Node* in2 = low_adr->in(2);
2506 assert(in2->is_Con(), "Constant rounding mode expected.");
2507 vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2508 vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2509 } else if (VectorNode::is_muladds2i(n)) {
2510 assert(n->req() == 5u, "MulAddS2I should have 4 operands.");
2511 Node* in1 = vector_opd(p, 1);
2512 Node* in2 = vector_opd(p, 2);
2513 vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2514 vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2515 } else if (n->req() == 3 && !is_cmov_pack(p)) {
2516 // Promote operands to vector
2517 Node* in1 = NULL;
2518 bool node_isa_reduction = n->is_reduction();
2519 if (node_isa_reduction) {
2520 // the input to the first reduction operation is retained
2521 in1 = low_adr->in(1);
2522 } else {
2523 in1 = vector_opd(p, 1);
2524 if (in1 == NULL) {
2525 if (do_reserve_copy()) {
2526 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: in1 should not be NULL, exiting SuperWord");})
2527 return; //and reverse to backup IG
2528 }
2529 ShouldNotReachHere();
2530 }
2531 }
2532 Node* in2 = vector_opd(p, 2);
2533 if (in2 == NULL) {
2534 if (do_reserve_copy()) {
2535 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: in2 should not be NULL, exiting SuperWord");})
2536 return; //and reverse to backup IG
2537 }
2538 ShouldNotReachHere();
2539 }
2540 if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
2541 // Move invariant vector input into second position to avoid register spilling.
2542 Node* tmp = in1;
2543 in1 = in2;
2544 in2 = tmp;
2545 }
2546 if (node_isa_reduction) {
2547 const Type *arith_type = n->bottom_type();
2548 vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
2549 if (in2->is_Load()) {
2550 vlen_in_bytes = in2->as_LoadVector()->memory_size();
2551 } else {
2552 vlen_in_bytes = in2->as_Vector()->length_in_bytes();
2553 }
2554 } else {
2555 vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2556 vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2557 }
2558 } else if (opc == Op_SqrtF || opc == Op_SqrtD ||
2559 opc == Op_AbsF || opc == Op_AbsD ||
2560 opc == Op_AbsI || opc == Op_AbsL ||
2561 opc == Op_NegF || opc == Op_NegD ||
2562 opc == Op_PopCountI) {
2563 assert(n->req() == 2, "only one input expected");
2564 Node* in = vector_opd(p, 1);
2565 vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
2566 vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2567 } else if (is_cmov_pack(p)) {
2568 if (can_process_post_loop) {
2569 // do not refactor of flow in post loop context
2570 return;
2571 }
2572 if (!n->is_CMove()) {
2573 continue;
2574 }
2575 // place here CMoveVDNode
2576 NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
2577 Node* bol = n->in(CMoveNode::Condition);
2578 if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > 1 ) {
2579 NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d is not Bool node, trying its in(1) node %d", bol->_idx, bol->in(1)->_idx); bol->dump(); bol->in(1)->dump();})
2580 bol = bol->in(1); //may be ExtractNode
2581 }
2582
2583 assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
2584 if (!bol->is_Bool()) {
2585 if (do_reserve_copy()) {
2586 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: expected %d bool node, exiting SuperWord", bol->_idx); bol->dump();})
2587 return; //and reverse to backup IG
2588 }
2589 ShouldNotReachHere();
2590 }
2591
2592 int cond = (int)bol->as_Bool()->_test._test;
2593 Node* in_cc = _igvn.intcon(cond);
2594 NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created intcon in_cc node %d", in_cc->_idx); in_cc->dump();})
2595 Node* cc = bol->clone();
2596 cc->set_req(1, in_cc);
2597 NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created bool cc node %d", cc->_idx); cc->dump();})
2598
2599 Node* src1 = vector_opd(p, 2); //2=CMoveNode::IfFalse
2600 if (src1 == NULL) {
2601 if (do_reserve_copy()) {
2602 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src1 should not be NULL, exiting SuperWord");})
2603 return; //and reverse to backup IG
2604 }
2605 ShouldNotReachHere();
2606 }
2607 Node* src2 = vector_opd(p, 3); //3=CMoveNode::IfTrue
2608 if (src2 == NULL) {
2609 if (do_reserve_copy()) {
2610 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src2 should not be NULL, exiting SuperWord");})
2611 return; //and reverse to backup IG
2612 }
2613 ShouldNotReachHere();
2614 }
2615 BasicType bt = velt_basic_type(n);
2616 const TypeVect* vt = TypeVect::make(bt, vlen);
2617 assert(bt == T_FLOAT || bt == T_DOUBLE, "Only vectorization for FP cmovs is supported");
2618 if (bt == T_FLOAT) {
2619 vn = new CMoveVFNode(cc, src1, src2, vt);
2620 } else {
2621 assert(bt == T_DOUBLE, "Expected double");
2622 vn = new CMoveVDNode(cc, src1, src2, vt);
2623 }
2624 NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
2625 } else if (opc == Op_FmaD || opc == Op_FmaF) {
2626 // Promote operands to vector
2627 Node* in1 = vector_opd(p, 1);
2628 Node* in2 = vector_opd(p, 2);
2629 Node* in3 = vector_opd(p, 3);
2630 vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n));
2631 vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2632 } else {
2633 if (do_reserve_copy()) {
2634 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
2635 return; //and reverse to backup IG
2636 }
2637 ShouldNotReachHere();
2638 }
2639
2640 assert(vn != NULL, "sanity");
2641 if (vn == NULL) {
2642 if (do_reserve_copy()){
2643 NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: got NULL node, cannot proceed, exiting SuperWord");})
2644 return; //and reverse to backup IG
2645 }
2646 ShouldNotReachHere();
2647 }
2648
2649 _block.at_put(i, vn);
2650 _igvn.register_new_node_with_optimizer(vn);
2651 _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
2652 for (uint j = 0; j < p->size(); j++) {
2653 Node* pm = p->at(j);
2654 _igvn.replace_node(pm, vn);
2655 }
2656 _igvn._worklist.push(vn);
2657
2658 if (can_process_post_loop) {
2659 // first check if the vector size if the maximum vector which we can use on the machine,
2660 // other vector size have reduced values for predicated data mapping.
2661 if (vlen_in_bytes != (uint)MaxVectorSize) {
2662 return;
2663 }
2664 }
2665
2666 if (vlen > max_vlen) {
2667 max_vlen = vlen;
2668 }
2669 if (vlen_in_bytes > max_vlen_in_bytes) {
2670 max_vlen_in_bytes = vlen_in_bytes;
2671 }
2672 #ifdef ASSERT
2673 if (TraceNewVectors) {
2674 tty->print("new Vector node: ");
2675 vn->dump();
2676 }
2677 #endif
2678 }
2679 }//for (int i = 0; i < _block.length(); i++)
2680
2681 if (max_vlen_in_bytes > C->max_vector_size()) {
2682 C->set_max_vector_size(max_vlen_in_bytes);
2683 }
2684 if (max_vlen_in_bytes > 0) {
2685 cl->mark_loop_vectorized();
2686 }
2687
2688 if (SuperWordLoopUnrollAnalysis) {
2689 if (cl->has_passed_slp()) {
2690 uint slp_max_unroll_factor = cl->slp_max_unroll();
2691 if (slp_max_unroll_factor == max_vlen) {
2692 if (TraceSuperWordLoopUnrollAnalysis) {
2693 tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
2694 }
2695
2696 // For atomic unrolled loops which are vector mapped, instigate more unrolling
2697 cl->set_notpassed_slp();
2698 if (cl->is_main_loop()) {
2699 // if vector resources are limited, do not allow additional unrolling, also
2700 // do not unroll more on pure vector loops which were not reduced so that we can
2701 // program the post loop to single iteration execution.
2702 if (FLOATPRESSURE > 8) {
2703 C->set_major_progress();
2704 cl->mark_do_unroll_only();
2705 }
2706 }
2707
2708 if (do_reserve_copy()) {
2709 if (can_process_post_loop) {
2710 // Now create the difference of trip and limit and use it as our mask index.
2711 // Note: We limited the unroll of the vectorized loop so that
2712 // only vlen-1 size iterations can remain to be mask programmed.
2713 Node *incr = cl->incr();
2714 SubINode *index = new SubINode(cl->limit(), cl->init_trip());
2715 _igvn.register_new_node_with_optimizer(index);
2716 SetVectMaskINode *mask = new SetVectMaskINode(_phase->get_ctrl(cl->init_trip()), index);
2717 _igvn.register_new_node_with_optimizer(mask);
2718 // make this a single iteration loop
2719 AddINode *new_incr = new AddINode(incr->in(1), mask);
2720 _igvn.register_new_node_with_optimizer(new_incr);
2721 _phase->set_ctrl(new_incr, _phase->get_ctrl(incr));
2722 _igvn.replace_node(incr, new_incr);
2723 cl->mark_is_multiversioned();
2724 cl->loopexit()->add_flag(Node::Flag_has_vector_mask_set);
2725 }
2726 }
2727 }
2728 }
2729 }
2730
2731 if (do_reserve_copy()) {
2732 make_reversable.use_new();
2733 }
2734 NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
2735 return;
2736 }
2737
//------------------------------vector_opd---------------------------
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
// Returns the vector node to use as that operand, or NULL when the operand
// cannot be vectorized (callers treat NULL as a bail-out signal).
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
  Node* p0 = p->at(0);
  uint vlen = p->size();
  Node* opd = p0->in(opd_idx);
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();

  if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) {
    // override vlen with the main loops vector length
    vlen = cl->slp_max_unroll();
  }

  // Case 1: every member of the pack reads the same input node.
  if (same_inputs(p, opd_idx)) {
    if (opd->is_Vector() || opd->is_LoadVector()) {
      assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
      if (opd_idx == 2 && VectorNode::is_shift(p0)) {
        NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("shift's count can't be vector");})
        return NULL;
      }
      return opd; // input is matching vector
    }
    if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
      // Scalar shift count: must be masked to the operand width before it
      // is moved into a vector register.
      Compile* C = _phase->C;
      Node* cnt = opd;
      // Vector instructions do not mask shift count, do it here.
      juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
      const TypeInt* t = opd->find_int_type();
      if (t != NULL && t->is_con()) {
        // Constant count: fold the masking at compile time if out of range.
        juint shift = t->get_con();
        if (shift > mask) { // Unsigned cmp
          cnt = ConNode::make(TypeInt::make(shift & mask));
        }
      } else {
        // Non-constant count: emit an AndI only when the known range may
        // exceed the mask.
        if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
          cnt = ConNode::make(TypeInt::make(mask));
          _igvn.register_new_node_with_optimizer(cnt);
          cnt = new AndINode(opd, cnt);
          _igvn.register_new_node_with_optimizer(cnt);
          _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
        }
        assert(opd->bottom_type()->isa_int(), "int type only");
        if (!opd->bottom_type()->isa_int()) {
          NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should be int type only");})
          return NULL;
        }
      }
      // Move shift count into vector register.
      cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
      _igvn.register_new_node_with_optimizer(cnt);
      _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
      return cnt;
    }
    assert(!opd->is_StoreVector(), "such vector is not expected here");
    if (opd->is_StoreVector()) {
      NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("StoreVector is not expected here");})
      return NULL;
    }
    // Convert scalar input to vector with the same number of elements as
    // p0's vector. Use p0's type because size of operand's container in
    // vector should match p0's size regardless operand's size.
    const Type* p0_t = NULL;
    VectorNode* vn = NULL;
    if (opd_idx == 2 && VectorNode::is_scalar_rotate(p0)) {
      // Rotate count is always an int; widen it to long when the rotated
      // value is long so the replicated lanes match the data type.
      Node* conv = opd;
      p0_t = TypeInt::INT;
      if (p0->bottom_type()->isa_long()) {
        p0_t = TypeLong::LONG;
        conv = new ConvI2LNode(opd);
        _igvn.register_new_node_with_optimizer(conv);
        _phase->set_ctrl(conv, _phase->get_ctrl(opd));
      }
      vn = VectorNode::scalar2vector(conv, vlen, p0_t);
    } else {
      p0_t = velt_type(p0);
      vn = VectorNode::scalar2vector(opd, vlen, p0_t);
    }

    _igvn.register_new_node_with_optimizer(vn);
    _phase->set_ctrl(vn, _phase->get_ctrl(opd));
#ifdef ASSERT
    if (TraceNewVectors) {
      tty->print("new Vector node: ");
      vn->dump();
    }
#endif
    return vn;
  }

  // Case 2: members read distinct inputs - gather them with a Pack node.
  // Insert pack operation
  BasicType bt = velt_basic_type(p0);
  PackNode* pk = PackNode::make(opd, vlen, bt);
  DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )

  for (uint i = 1; i < vlen; i++) {
    Node* pi = p->at(i);
    Node* in = pi->in(opd_idx);
    assert(my_pack(in) == NULL, "Should already have been unpacked");
    if (my_pack(in) != NULL) {
      NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should already have been unpacked");})
      return NULL;
    }
    assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
    pk->add_opd(in);
    if (VectorNode::is_muladds2i(pi)) {
      // MulAddS2I consumes two short operands per input slot, so also pack
      // the second operand at opd_idx + 2.
      Node* in2 = pi->in(opd_idx + 2);
      assert(my_pack(in2) == NULL, "Should already have been unpacked");
      if (my_pack(in2) != NULL) {
        NOT_PRODUCT(if (is_trace_loop_reverse() || TraceLoopOpts) { tty->print_cr("Should already have been unpacked"); })
        return NULL;
      }
      assert(opd_bt == in2->bottom_type()->basic_type(), "all same type");
      pk->add_opd(in2);
    }
  }
  _igvn.register_new_node_with_optimizer(pk);
  _phase->set_ctrl(pk, _phase->get_ctrl(opd));
#ifdef ASSERT
  if (TraceNewVectors) {
    tty->print("new Vector node: ");
    pk->dump();
  }
#endif
  return pk;
}
2863
//------------------------------insert_extracts---------------------------
// If a use of pack p is not a vector use, then replace the
// use with an extract operation.
void SuperWord::insert_extracts(Node_List* p) {
  if (p->at(0)->is_Store()) return;   // stores produce no value to extract
  assert(_n_idx_list.is_empty(), "empty (node,index) list");

  // Inspect each use of each pack member. For each use that is
  // not a vector use, replace the use with an extract operation.
  // Two phases: first collect (use, input-index) pairs, then rewrite.
  // Collecting first avoids mutating def-use edges while iterating them.

  for (uint i = 0; i < p->size(); i++) {
    Node* def = p->at(i);
    for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
      Node* use = def->fast_out(j);
      for (uint k = 0; k < use->req(); k++) {
        Node* n = use->in(k);
        if (def == n) {
          // Skip the boolean input of cmov packs; everything else that is
          // not a vector use needs a scalar extract.
          Node_List* u_pk = my_pack(use);
          if ((u_pk == NULL || !is_cmov_pack(u_pk) || use->is_CMove()) && !is_vector_use(use, k)) {
            _n_idx_list.push(use, k);
          }
        }
      }
    }
  }

  // Phase 2: rewrite each collected scalar use through an Extract node.
  while (_n_idx_list.is_nonempty()) {
    Node* use = _n_idx_list.node();
    int idx = _n_idx_list.index();
    _n_idx_list.pop();
    Node* def = use->in(idx);

    // Reduction defs stay scalar; no extract needed.
    if (def->is_reduction()) continue;

    // Insert extract operation
    _igvn.hash_delete(def);
    // Lane position of def within its vector, derived from its alignment.
    int def_pos = alignment(def) / data_size(def);

    Node* ex = ExtractNode::make(def, def_pos, velt_basic_type(def));
    _igvn.register_new_node_with_optimizer(ex);
    _phase->set_ctrl(ex, _phase->get_ctrl(def));
    _igvn.replace_input_of(use, idx, ex);
    _igvn._worklist.push(def);

    bb_insert_after(ex, bb_idx(def));
    set_velt_type(ex, velt_type(def));
  }
}
2912
2913 //------------------------------is_vector_use---------------------------
2914 // Is use->in(u_idx) a vector use?
is_vector_use(Node * use,int u_idx)2915 bool SuperWord::is_vector_use(Node* use, int u_idx) {
2916 Node_List* u_pk = my_pack(use);
2917 if (u_pk == NULL) return false;
2918 if (use->is_reduction()) return true;
2919 Node* def = use->in(u_idx);
2920 Node_List* d_pk = my_pack(def);
2921 if (d_pk == NULL) {
2922 // check for scalar promotion
2923 Node* n = u_pk->at(0)->in(u_idx);
2924 for (uint i = 1; i < u_pk->size(); i++) {
2925 if (u_pk->at(i)->in(u_idx) != n) return false;
2926 }
2927 return true;
2928 }
2929 if (VectorNode::is_muladds2i(use)) {
2930 // MulAddS2I takes shorts and produces ints - hence the special checks
2931 // on alignment and size.
2932 if (u_pk->size() * 2 != d_pk->size()) {
2933 return false;
2934 }
2935 for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
2936 Node* ui = u_pk->at(i);
2937 Node* di = d_pk->at(i);
2938 if (alignment(ui) != alignment(di) * 2) {
2939 return false;
2940 }
2941 }
2942 return true;
2943 }
2944 if (u_pk->size() != d_pk->size())
2945 return false;
2946 for (uint i = 0; i < u_pk->size(); i++) {
2947 Node* ui = u_pk->at(i);
2948 Node* di = d_pk->at(i);
2949 if (ui->in(u_idx) != di || alignment(ui) != alignment(di))
2950 return false;
2951 }
2952 return true;
2953 }
2954
//------------------------------construct_bb---------------------------
// Construct reverse postorder list of block members
// Returns true iff the block is worth vectorizing: it has at least one
// memory slice, reduction use, or data entry node.
bool SuperWord::construct_bb() {
  Node* entry = bb();

  assert(_stk.length() == 0, "stk is empty");
  assert(_block.length() == 0, "block is empty");
  assert(_data_entry.length() == 0, "data_entry is empty");
  assert(_mem_slice_head.length() == 0, "mem_slice_head is empty");
  assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");

  // Find non-control nodes with no inputs from within block,
  // create a temporary map from node _idx to bb_idx for use
  // by the visited and post_visited sets,
  // and count number of nodes in block.
  int bb_ct = 0;
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node *n = lpt()->_body.at(i);
    set_bb_idx(n, i); // Create a temporary map
    if (in_bb(n)) {
      if (n->is_LoadStore() || n->is_MergeMem() ||
          (n->is_Proj() && !n->as_Proj()->is_CFG())) {
        // Bailout if the loop has LoadStore, MergeMem or data Proj
        // nodes. Superword optimization does not work with them.
        return false;
      }
      bb_ct++;
      if (!n->is_CFG()) {
        // A node with no in-block inputs is a "data entry" - a root of the
        // data-flow DFS below.
        bool found = false;
        for (uint j = 0; j < n->req(); j++) {
          Node* def = n->in(j);
          if (def && in_bb(def)) {
            found = true;
            break;
          }
        }
        if (!found) {
          assert(n != entry, "can't be entry");
          _data_entry.push(n);
        }
      }
    }
  }

  // Find memory slices (head and tail)
  // A slice is a memory Phi (head) whose backedge input (tail) differs from
  // its entry input, i.e. memory state updated inside the loop.
  for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
    Node *n = lp()->fast_out(i);
    if (in_bb(n) && (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
      Node* n_tail = n->in(LoopNode::LoopBackControl);
      if (n_tail != n->in(LoopNode::EntryControl)) {
        if (!n_tail->is_Mem()) {
          assert(n_tail->is_Mem(), "unexpected node for memory slice: %s", n_tail->Name());
          return false; // Bailout
        }
        _mem_slice_head.push(n);
        _mem_slice_tail.push(n_tail);
      }
    }
  }

  // Create an RPO list of nodes in block

  visited_clear();
  post_visited_clear();

  // Push all non-control nodes with no inputs from within block, then control entry
  for (int j = 0; j < _data_entry.length(); j++) {
    Node* n = _data_entry.at(j);
    visited_set(n);
    _stk.push(n);
  }
  visited_set(entry);
  _stk.push(entry);

  // Do a depth first walk over out edges
  // Nodes are assigned block positions in reverse as they are post-visited,
  // yielding a reverse postorder in _block.
  int rpo_idx = bb_ct - 1;
  int size;
  int reduction_uses = 0;
  while ((size = _stk.length()) > 0) {
    Node* n = _stk.top(); // Leave node on stack
    if (!visited_test_set(n)) {
      // forward arc in graph
    } else if (!post_visited_test(n)) {
      // cross or back arc
      for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
        Node *use = n->fast_out(i);
        if (in_bb(use) && !visited_test(use) &&
            // Don't go around backedge
            (!use->is_Phi() || n == entry)) {
          if (use->is_reduction()) {
            // First see if we can map the reduction on the given system we are on, then
            // make a data entry operation for each reduction we see.
            BasicType bt = use->bottom_type()->basic_type();
            if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
              reduction_uses++;
            }
          }
          _stk.push(use);
        }
      }
      if (_stk.length() == size) {
        // There were no additional uses, post visit node now
        _stk.pop(); // Remove node from stack
        assert(rpo_idx >= 0, "");
        _block.at_put_grow(rpo_idx, n);
        rpo_idx--;
        post_visited_set(n);
        assert(rpo_idx >= 0 || _stk.is_empty(), "");
      }
    } else {
      _stk.pop(); // Remove post-visited node from stack
    }
  }//while

  int ii_current = -1;
  unsigned int load_idx = (unsigned int)-1;
  // Build iterations order if needed
  bool build_ii_order = _do_vector_loop_experimental && _ii_order.is_empty();
  // Create real map of block indices for nodes
  for (int j = 0; j < _block.length(); j++) {
    Node* n = _block.at(j);
    set_bb_idx(n, j);
    if (build_ii_order && n->is_Load()) {
      // Record the order of unroll generations by tracking the clone-map
      // generation of successive clones of the first load seen.
      if (ii_current == -1) {
        ii_current = _clone_map.gen(n->_idx);
        _ii_order.push(ii_current);
        load_idx = _clone_map.idx(n->_idx);
      } else if (_clone_map.idx(n->_idx) == load_idx && _clone_map.gen(n->_idx) != ii_current) {
        ii_current = _clone_map.gen(n->_idx);
        _ii_order.push(ii_current);
      }
    }
  }//for

  // Ensure extra info is allocated.
  initialize_bb();

#ifndef PRODUCT
  if (_vector_loop_debug && _ii_order.length() > 0) {
    tty->print("SuperWord::construct_bb: List of generations: ");
    for (int jj = 0; jj < _ii_order.length(); ++jj) {
      tty->print("  %d:%d", jj, _ii_order.at(jj));
    }
    tty->print_cr(" ");
  }
  if (TraceSuperWord) {
    print_bb();
    tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > 0 ? "" : "NONE");
    for (int m = 0; m < _data_entry.length(); m++) {
      tty->print("%3d ", m);
      _data_entry.at(m)->dump();
    }
    tty->print_cr("\nmemory slices: %s", _mem_slice_head.length() > 0 ? "" : "NONE");
    for (int m = 0; m < _mem_slice_head.length(); m++) {
      tty->print("%3d ", m); _mem_slice_head.at(m)->dump();
      tty->print("    ");    _mem_slice_tail.at(m)->dump();
    }
  }
#endif
  assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
  return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
}
3117
3118 //------------------------------initialize_bb---------------------------
3119 // Initialize per node info
initialize_bb()3120 void SuperWord::initialize_bb() {
3121 Node* last = _block.at(_block.length() - 1);
3122 grow_node_info(bb_idx(last));
3123 }
3124
3125 //------------------------------bb_insert_after---------------------------
3126 // Insert n into block after pos
bb_insert_after(Node * n,int pos)3127 void SuperWord::bb_insert_after(Node* n, int pos) {
3128 int n_pos = pos + 1;
3129 // Make room
3130 for (int i = _block.length() - 1; i >= n_pos; i--) {
3131 _block.at_put_grow(i+1, _block.at(i));
3132 }
3133 for (int j = _node_info.length() - 1; j >= n_pos; j--) {
3134 _node_info.at_put_grow(j+1, _node_info.at(j));
3135 }
3136 // Set value
3137 _block.at_put_grow(n_pos, n);
3138 _node_info.at_put_grow(n_pos, SWNodeInfo::initial);
3139 // Adjust map from node->_idx to _block index
3140 for (int i = n_pos; i < _block.length(); i++) {
3141 set_bb_idx(_block.at(i), i);
3142 }
3143 }
3144
3145 //------------------------------compute_max_depth---------------------------
3146 // Compute max depth for expressions from beginning of block
3147 // Use to prune search paths during test for independence.
compute_max_depth()3148 void SuperWord::compute_max_depth() {
3149 int ct = 0;
3150 bool again;
3151 do {
3152 again = false;
3153 for (int i = 0; i < _block.length(); i++) {
3154 Node* n = _block.at(i);
3155 if (!n->is_Phi()) {
3156 int d_orig = depth(n);
3157 int d_in = 0;
3158 for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
3159 Node* pred = preds.current();
3160 if (in_bb(pred)) {
3161 d_in = MAX2(d_in, depth(pred));
3162 }
3163 }
3164 if (d_in + 1 != d_orig) {
3165 set_depth(n, d_in + 1);
3166 again = true;
3167 }
3168 }
3169 }
3170 ct++;
3171 } while (again);
3172
3173 if (TraceSuperWord && Verbose) {
3174 tty->print_cr("compute_max_depth iterated: %d times", ct);
3175 }
3176 }
3177
//-------------------------compute_vector_element_type-----------------------
// Compute necessary vector element type for expressions
// This propagates backwards a narrower integer type when the
// upper bits of the value are not needed.
// Example: char a,b,c;  a = b + c;
// Normally the type of the add is integer, but for packed character
// operations the type of the add needs to be char.
void SuperWord::compute_vector_element_type() {
  if (TraceSuperWord && Verbose) {
    tty->print_cr("\ncompute_velt_type:");
  }

  // Initial type
  for (int i = 0; i < _block.length(); i++) {
    Node* n = _block.at(i);
    set_velt_type(n, container_type(n));
  }

  // Propagate integer narrowed type backwards through operations
  // that don't depend on higher order bits
  // Walk in reverse so a node's narrowed type is set before its inputs are
  // examined.
  for (int i = _block.length() - 1; i >= 0; i--) {
    Node* n = _block.at(i);
    // Only integer types need be examined
    const Type* vtn = velt_type(n);
    if (vtn->basic_type() == T_INT) {
      uint start, end;
      VectorNode::vector_operands(n, &start, &end);

      for (uint j = start; j < end; j++) {
        Node* in  = n->in(j);
        // Don't propagate through a memory
        if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
            data_size(n) < data_size(in)) {
          // Narrow the input only if ALL of its in-block uses agree on the
          // narrower type; otherwise some use still needs the wide value.
          bool same_type = true;
          for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
            Node *use = in->fast_out(k);
            if (!in_bb(use) || !same_velt_type(use, n)) {
              same_type = false;
              break;
            }
          }
          if (same_type) {
            // In any Java arithmetic operation, operands of small integer types
            // (boolean, byte, char & short) should be promoted to int first. As
            // vector elements of small types don't have upper bits of int, for
            // RShiftI or AbsI operations, the compiler has to know the precise
            // signedness info of the 1st operand. These operations shouldn't be
            // vectorized if the signedness info is imprecise.
            const Type* vt = vtn;
            int op = in->Opcode();
            if (VectorNode::is_shift_opcode(op) || op == Op_AbsI) {
              Node* load = in->in(1);
              if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
                // Only Load nodes distinguish signed (LoadS/LoadB) and unsigned
                // (LoadUS/LoadUB) values. Store nodes only have one version.
                vt = velt_type(load);
              } else if (op != Op_LShiftI) {
                // Widen type to int to avoid the creation of vector nodes. Note
                // that left shifts work regardless of the signedness.
                vt = TypeInt::INT;
              }
            }
            set_velt_type(in, vt);
          }
        }
      }
    }
  }
#ifndef PRODUCT
  if (TraceSuperWord && Verbose) {
    for (int i = 0; i < _block.length(); i++) {
      Node* n = _block.at(i);
      velt_type(n)->dump();
      tty->print("\t");
      n->dump();
    }
  }
#endif
}
3257
//------------------------------memory_alignment---------------------------
// Alignment within a vector memory reference
// Returns the byte position of s within a vector-width-sized window
// (in the range [0, vw)), or bottom_align when it cannot be determined.
int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
#ifndef PRODUCT
  if ((TraceSuperWord && Verbose) || is_trace_alignment()) {
    tty->print("SuperWord::memory_alignment within a vector memory reference for %d:  ", s->_idx); s->dump();
  }
#endif
  NOT_PRODUCT(SWPointer::Tracer::Depth ddd(0);)
  SWPointer p(s, this, NULL, false);
  if (!p.valid()) {
    // Address expression not of the recognized base+invar+scale*iv+offset form.
    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align");)
    return bottom_align;
  }
  int vw = get_vw_bytes_special(s);
  if (vw < 2) {
    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
    return bottom_align; // No vectors for this type
  }
  // Offset of the reference, adjusted by iv_adjust iterations' worth of bytes.
  int offset  = p.offset_in_bytes();
  offset     += iv_adjust*p.memory_size();
  // Normalize the remainder into [0, vw) (C's % can yield negatives).
  int off_rem = offset % vw;
  int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
#ifndef PRODUCT
  if ((TraceSuperWord && Verbose) || is_trace_alignment()) {
    tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod);
  }
#endif
  return off_mod;
}
3288
3289 //---------------------------container_type---------------------------
3290 // Smallest type containing range of values
container_type(Node * n)3291 const Type* SuperWord::container_type(Node* n) {
3292 if (n->is_Mem()) {
3293 BasicType bt = n->as_Mem()->memory_type();
3294 if (n->is_Store() && (bt == T_CHAR)) {
3295 // Use T_SHORT type instead of T_CHAR for stored values because any
3296 // preceding arithmetic operation extends values to signed Int.
3297 bt = T_SHORT;
3298 }
3299 if (n->Opcode() == Op_LoadUB) {
3300 // Adjust type for unsigned byte loads, it is important for right shifts.
3301 // T_BOOLEAN is used because there is no basic type representing type
3302 // TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
3303 // size (one byte) and sign is important.
3304 bt = T_BOOLEAN;
3305 }
3306 return Type::get_const_basic_type(bt);
3307 }
3308 const Type* t = _igvn.type(n);
3309 if (t->basic_type() == T_INT) {
3310 // A narrow type of arithmetic operations will be determined by
3311 // propagating the type of memory operations.
3312 return TypeInt::INT;
3313 }
3314 return t;
3315 }
3316
same_velt_type(Node * n1,Node * n2)3317 bool SuperWord::same_velt_type(Node* n1, Node* n2) {
3318 const Type* vt1 = velt_type(n1);
3319 const Type* vt2 = velt_type(n2);
3320 if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
3321 // Compare vectors element sizes for integer types.
3322 return data_size(n1) == data_size(n2);
3323 }
3324 return vt1 == vt2;
3325 }
3326
3327 //------------------------------in_packset---------------------------
3328 // Are s1 and s2 in a pack pair and ordered as s1,s2?
in_packset(Node * s1,Node * s2)3329 bool SuperWord::in_packset(Node* s1, Node* s2) {
3330 for (int i = 0; i < _packset.length(); i++) {
3331 Node_List* p = _packset.at(i);
3332 assert(p->size() == 2, "must be");
3333 if (p->at(0) == s1 && p->at(p->size()-1) == s2) {
3334 return true;
3335 }
3336 }
3337 return false;
3338 }
3339
3340 //------------------------------in_pack---------------------------
3341 // Is s in pack p?
in_pack(Node * s,Node_List * p)3342 Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
3343 for (uint i = 0; i < p->size(); i++) {
3344 if (p->at(i) == s) {
3345 return p;
3346 }
3347 }
3348 return NULL;
3349 }
3350
3351 //------------------------------remove_pack_at---------------------------
3352 // Remove the pack at position pos in the packset
remove_pack_at(int pos)3353 void SuperWord::remove_pack_at(int pos) {
3354 Node_List* p = _packset.at(pos);
3355 for (uint i = 0; i < p->size(); i++) {
3356 Node* s = p->at(i);
3357 set_my_pack(s, NULL);
3358 }
3359 _packset.remove_at(pos);
3360 }
3361
packset_sort(int n)3362 void SuperWord::packset_sort(int n) {
3363 // simple bubble sort so that we capitalize with O(n) when its already sorted
3364 while (n != 0) {
3365 bool swapped = false;
3366 for (int i = 1; i < n; i++) {
3367 Node_List* q_low = _packset.at(i-1);
3368 Node_List* q_i = _packset.at(i);
3369
3370 // only swap when we find something to swap
3371 if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
3372 Node_List* t = q_i;
3373 *(_packset.adr_at(i)) = q_low;
3374 *(_packset.adr_at(i-1)) = q_i;
3375 swapped = true;
3376 }
3377 }
3378 if (swapped == false) break;
3379 n--;
3380 }
3381 }
3382
3383 //------------------------------executed_first---------------------------
3384 // Return the node executed first in pack p. Uses the RPO block list
3385 // to determine order.
executed_first(Node_List * p)3386 Node* SuperWord::executed_first(Node_List* p) {
3387 Node* n = p->at(0);
3388 int n_rpo = bb_idx(n);
3389 for (uint i = 1; i < p->size(); i++) {
3390 Node* s = p->at(i);
3391 int s_rpo = bb_idx(s);
3392 if (s_rpo < n_rpo) {
3393 n = s;
3394 n_rpo = s_rpo;
3395 }
3396 }
3397 return n;
3398 }
3399
3400 //------------------------------executed_last---------------------------
3401 // Return the node executed last in pack p.
executed_last(Node_List * p)3402 Node* SuperWord::executed_last(Node_List* p) {
3403 Node* n = p->at(0);
3404 int n_rpo = bb_idx(n);
3405 for (uint i = 1; i < p->size(); i++) {
3406 Node* s = p->at(i);
3407 int s_rpo = bb_idx(s);
3408 if (s_rpo > n_rpo) {
3409 n = s;
3410 n_rpo = s_rpo;
3411 }
3412 }
3413 return n;
3414 }
3415
control_dependency(Node_List * p)3416 LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
3417 LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
3418 for (uint i = 0; i < p->size(); i++) {
3419 Node* n = p->at(i);
3420 assert(n->is_Load(), "only meaningful for loads");
3421 if (!n->depends_only_on_test()) {
3422 if (n->as_Load()->has_unknown_control_dependency() &&
3423 dep != LoadNode::Pinned) {
3424 // Upgrade to unknown control...
3425 dep = LoadNode::UnknownControl;
3426 } else {
3427 // Otherwise, we must pin it.
3428 dep = LoadNode::Pinned;
3429 }
3430 }
3431 }
3432 return dep;
3433 }
3434
3435
3436 //----------------------------align_initial_loop_index---------------------------
3437 // Adjust pre-loop limit so that in main loop, a load/store reference
3438 // to align_to_ref will be a position zero in the vector.
3439 // (iv + k) mod vector_align == 0
align_initial_loop_index(MemNode * align_to_ref)3440 void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
3441 assert(lp()->is_main_loop(), "");
3442 CountedLoopEndNode* pre_end = pre_loop_end();
3443 Node* pre_opaq1 = pre_end->limit();
3444 assert(pre_opaq1->Opcode() == Op_Opaque1, "");
3445 Opaque1Node* pre_opaq = (Opaque1Node*)pre_opaq1;
3446 Node* lim0 = pre_opaq->in(1);
3447
3448 // Where we put new limit calculations
3449 Node* pre_ctrl = pre_loop_head()->in(LoopNode::EntryControl);
3450
3451 // Ensure the original loop limit is available from the
3452 // pre-loop Opaque1 node.
3453 Node* orig_limit = pre_opaq->original_loop_limit();
3454 assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
3455
3456 SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
3457 assert(align_to_ref_p.valid(), "sanity");
3458
3459 // Given:
3460 // lim0 == original pre loop limit
3461 // V == v_align (power of 2)
3462 // invar == extra invariant piece of the address expression
3463 // e == offset [ +/- invar ]
3464 //
3465 // When reassociating expressions involving '%' the basic rules are:
3466 // (a - b) % k == 0 => a % k == b % k
3467 // and:
3468 // (a + b) % k == 0 => a % k == (k - b) % k
3469 //
3470 // For stride > 0 && scale > 0,
3471 // Derive the new pre-loop limit "lim" such that the two constraints:
3472 // (1) lim = lim0 + N (where N is some positive integer < V)
3473 // (2) (e + lim) % V == 0
3474 // are true.
3475 //
3476 // Substituting (1) into (2),
3477 // (e + lim0 + N) % V == 0
3478 // solve for N:
3479 // N = (V - (e + lim0)) % V
3480 // substitute back into (1), so that new limit
3481 // lim = lim0 + (V - (e + lim0)) % V
3482 //
3483 // For stride > 0 && scale < 0
3484 // Constraints:
3485 // lim = lim0 + N
3486 // (e - lim) % V == 0
3487 // Solving for lim:
3488 // (e - lim0 - N) % V == 0
3489 // N = (e - lim0) % V
3490 // lim = lim0 + (e - lim0) % V
3491 //
3492 // For stride < 0 && scale > 0
3493 // Constraints:
3494 // lim = lim0 - N
3495 // (e + lim) % V == 0
3496 // Solving for lim:
3497 // (e + lim0 - N) % V == 0
3498 // N = (e + lim0) % V
3499 // lim = lim0 - (e + lim0) % V
3500 //
3501 // For stride < 0 && scale < 0
3502 // Constraints:
3503 // lim = lim0 - N
3504 // (e - lim) % V == 0
3505 // Solving for lim:
3506 // (e - lim0 + N) % V == 0
3507 // N = (V - (e - lim0)) % V
3508 // lim = lim0 - (V - (e - lim0)) % V
3509
3510 int vw = vector_width_in_bytes(align_to_ref);
3511 int stride = iv_stride();
3512 int scale = align_to_ref_p.scale_in_bytes();
3513 int elt_size = align_to_ref_p.memory_size();
3514 int v_align = vw / elt_size;
3515 assert(v_align > 1, "sanity");
3516 int offset = align_to_ref_p.offset_in_bytes() / elt_size;
3517 Node *offsn = _igvn.intcon(offset);
3518
3519 Node *e = offsn;
3520 if (align_to_ref_p.invar() != NULL) {
3521 // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
3522 Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
3523 Node* invar = align_to_ref_p.invar();
3524 if (_igvn.type(invar)->isa_long()) {
3525 // Computations are done % (vector width/element size) so it's
3526 // safe to simply convert invar to an int and loose the upper 32
3527 // bit half.
3528 invar = new ConvL2INode(invar);
3529 _igvn.register_new_node_with_optimizer(invar);
3530 }
3531 Node* invar_scale = align_to_ref_p.invar_scale();
3532 if (invar_scale != NULL) {
3533 invar = new LShiftINode(invar, invar_scale);
3534 _igvn.register_new_node_with_optimizer(invar);
3535 }
3536 Node* aref = new URShiftINode(invar, log2_elt);
3537 _igvn.register_new_node_with_optimizer(aref);
3538 _phase->set_ctrl(aref, pre_ctrl);
3539 if (align_to_ref_p.negate_invar()) {
3540 e = new SubINode(e, aref);
3541 } else {
3542 e = new AddINode(e, aref);
3543 }
3544 _igvn.register_new_node_with_optimizer(e);
3545 _phase->set_ctrl(e, pre_ctrl);
3546 }
3547 if (vw > ObjectAlignmentInBytes || align_to_ref_p.base()->is_top()) {
3548 // incorporate base e +/- base && Mask >>> log2(elt)
3549 Node* xbase = new CastP2XNode(NULL, align_to_ref_p.adr());
3550 _igvn.register_new_node_with_optimizer(xbase);
3551 #ifdef _LP64
3552 xbase = new ConvL2INode(xbase);
3553 _igvn.register_new_node_with_optimizer(xbase);
3554 #endif
3555 Node* mask = _igvn.intcon(vw-1);
3556 Node* masked_xbase = new AndINode(xbase, mask);
3557 _igvn.register_new_node_with_optimizer(masked_xbase);
3558 Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
3559 Node* bref = new URShiftINode(masked_xbase, log2_elt);
3560 _igvn.register_new_node_with_optimizer(bref);
3561 _phase->set_ctrl(bref, pre_ctrl);
3562 e = new AddINode(e, bref);
3563 _igvn.register_new_node_with_optimizer(e);
3564 _phase->set_ctrl(e, pre_ctrl);
3565 }
3566
3567 // compute e +/- lim0
3568 if (scale < 0) {
3569 e = new SubINode(e, lim0);
3570 } else {
3571 e = new AddINode(e, lim0);
3572 }
3573 _igvn.register_new_node_with_optimizer(e);
3574 _phase->set_ctrl(e, pre_ctrl);
3575
3576 if (stride * scale > 0) {
3577 // compute V - (e +/- lim0)
3578 Node* va = _igvn.intcon(v_align);
3579 e = new SubINode(va, e);
3580 _igvn.register_new_node_with_optimizer(e);
3581 _phase->set_ctrl(e, pre_ctrl);
3582 }
3583 // compute N = (exp) % V
3584 Node* va_msk = _igvn.intcon(v_align - 1);
3585 Node* N = new AndINode(e, va_msk);
3586 _igvn.register_new_node_with_optimizer(N);
3587 _phase->set_ctrl(N, pre_ctrl);
3588
3589 // substitute back into (1), so that new limit
3590 // lim = lim0 + N
3591 Node* lim;
3592 if (stride < 0) {
3593 lim = new SubINode(lim0, N);
3594 } else {
3595 lim = new AddINode(lim0, N);
3596 }
3597 _igvn.register_new_node_with_optimizer(lim);
3598 _phase->set_ctrl(lim, pre_ctrl);
3599 Node* constrained =
3600 (stride > 0) ? (Node*) new MinINode(lim, orig_limit)
3601 : (Node*) new MaxINode(lim, orig_limit);
3602 _igvn.register_new_node_with_optimizer(constrained);
3603 _phase->set_ctrl(constrained, pre_ctrl);
3604 _igvn.replace_input_of(pre_opaq, 1, constrained);
3605 }
3606
3607 //----------------------------get_pre_loop_end---------------------------
3608 // Find pre loop end from main loop. Returns null if none.
find_pre_loop_end(CountedLoopNode * cl) const3609 CountedLoopEndNode* SuperWord::find_pre_loop_end(CountedLoopNode* cl) const {
3610 // The loop cannot be optimized if the graph shape at
3611 // the loop entry is inappropriate.
3612 if (!PhaseIdealLoop::is_canonical_loop_entry(cl)) {
3613 return NULL;
3614 }
3615
3616 Node* p_f = cl->skip_predicates()->in(0)->in(0);
3617 if (!p_f->is_IfFalse()) return NULL;
3618 if (!p_f->in(0)->is_CountedLoopEnd()) return NULL;
3619 CountedLoopEndNode* pre_end = p_f->in(0)->as_CountedLoopEnd();
3620 CountedLoopNode* loop_node = pre_end->loopnode();
3621 if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
3622 return pre_end;
3623 }
3624
3625 //------------------------------init---------------------------
init()3626 void SuperWord::init() {
3627 _dg.init();
3628 _packset.clear();
3629 _disjoint_ptrs.clear();
3630 _block.clear();
3631 _post_block.clear();
3632 _data_entry.clear();
3633 _mem_slice_head.clear();
3634 _mem_slice_tail.clear();
3635 _iteration_first.clear();
3636 _iteration_last.clear();
3637 _node_info.clear();
3638 _align_to_ref = NULL;
3639 _lpt = NULL;
3640 _lp = NULL;
3641 _bb = NULL;
3642 _iv = NULL;
3643 _race_possible = 0;
3644 _early_return = false;
3645 _num_work_vecs = 0;
3646 _num_reductions = 0;
3647 }
3648
3649 //------------------------------restart---------------------------
restart()3650 void SuperWord::restart() {
3651 _dg.init();
3652 _packset.clear();
3653 _disjoint_ptrs.clear();
3654 _block.clear();
3655 _post_block.clear();
3656 _data_entry.clear();
3657 _mem_slice_head.clear();
3658 _mem_slice_tail.clear();
3659 _node_info.clear();
3660 }
3661
3662 //------------------------------print_packset---------------------------
print_packset()3663 void SuperWord::print_packset() {
3664 #ifndef PRODUCT
3665 tty->print_cr("packset");
3666 for (int i = 0; i < _packset.length(); i++) {
3667 tty->print_cr("Pack: %d", i);
3668 Node_List* p = _packset.at(i);
3669 print_pack(p);
3670 }
3671 #endif
3672 }
3673
3674 //------------------------------print_pack---------------------------
print_pack(Node_List * p)3675 void SuperWord::print_pack(Node_List* p) {
3676 for (uint i = 0; i < p->size(); i++) {
3677 print_stmt(p->at(i));
3678 }
3679 }
3680
3681 //------------------------------print_bb---------------------------
print_bb()3682 void SuperWord::print_bb() {
3683 #ifndef PRODUCT
3684 tty->print_cr("\nBlock");
3685 for (int i = 0; i < _block.length(); i++) {
3686 Node* n = _block.at(i);
3687 tty->print("%d ", i);
3688 if (n) {
3689 n->dump();
3690 }
3691 }
3692 #endif
3693 }
3694
3695 //------------------------------print_stmt---------------------------
// Dump one statement: its alignment value followed by the node itself.
void SuperWord::print_stmt(Node* s) {
#ifndef PRODUCT
  tty->print(" align: %d \t", alignment(s));
  s->dump();
#endif
}
3702
3703 //------------------------------blank---------------------------
blank(uint depth)3704 char* SuperWord::blank(uint depth) {
3705 static char blanks[101];
3706 assert(depth < 101, "too deep");
3707 for (uint i = 0; i < depth; i++) blanks[i] = ' ';
3708 blanks[depth] = '\0';
3709 return blanks;
3710 }
3711
3712
3713 //==============================SWPointer===========================
3714 #ifndef PRODUCT
3715 int SWPointer::Tracer::_depth = 0;
3716 #endif
3717 //----------------------------SWPointer------------------------
// Decompose the address of 'mem' into the canonical form
//   base + k*iv + constant_offset [+/- invariant]
// filling in _base/_adr/_scale/_offset/_invar. On any pattern-match
// failure the fields are left in a state where valid() is false.
// In 'analyze_only' mode, main-loop-variant nodes encountered during the
// match are additionally pushed on 'nstack'.
SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
  _mem(mem), _slp(slp), _base(NULL), _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
  _invar_scale(NULL),
  _nstack(nstack), _analyze_only(analyze_only),
  _stack_idx(0)
#ifndef PRODUCT
  , _tracer(slp)
#endif
{
  NOT_PRODUCT(_tracer.ctor_1(mem);)

  Node* adr = mem->in(MemNode::Address);
  if (!adr->is_AddP()) {
    assert(!valid(), "too complex");
    return;
  }
  // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
  Node* base = adr->in(AddPNode::Base);
  // The base address should be loop invariant
  if (is_main_loop_member(base)) {
    assert(!valid(), "base address is loop variant");
    return;
  }
  // unsafe references require misaligned vector access support
  if (base->is_top() && !Matcher::misaligned_vectors_ok()) {
    assert(!valid(), "unsafe access");
    return;
  }

  NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.store_depth();)
  NOT_PRODUCT(_tracer.ctor_2(adr);)

  // Peel up to 3 levels of AddP, accumulating scale/offset/invariant
  // from each AddP's Offset input as we walk towards the base.
  int i;
  for (i = 0; i < 3; i++) {
    NOT_PRODUCT(_tracer.ctor_3(adr, i);)

    if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
      assert(!valid(), "too complex");
      return;
    }
    adr = adr->in(AddPNode::Address);
    NOT_PRODUCT(_tracer.ctor_4(adr, i);)

    if (base == adr || !adr->is_AddP()) {
      NOT_PRODUCT(_tracer.ctor_5(adr, base, i);)
      break; // stop looking at addp's
    }
  }
  // The innermost address must itself be loop invariant.
  if (is_main_loop_member(adr)) {
    assert(!valid(), "adr is loop variant");
    return;
  }

  // For heap references (non-top base) the AddP chain must end at the base.
  if (!base->is_top() && adr != base) {
    assert(!valid(), "adr and base differ");
    return;
  }

  NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.restore_depth();)
  NOT_PRODUCT(_tracer.ctor_6(mem);)

  // Success: record base/adr; valid() is expected to hold from here on.
  _base = base;
  _adr = adr;
  assert(valid(), "Usable");
}
3784
3785 // Following is used to create a temporary object during
3786 // the pattern match of an address expression.
// Temporary-match constructor: copies the match context from 'p'
// (_mem, _slp, _nstack, _analyze_only, _stack_idx) but resets the match
// results (_base, _adr, _scale, _offset, _invar), so a subtree can be
// matched tentatively without disturbing 'p' (see Op_LShiftL in scaled_iv).
SWPointer::SWPointer(SWPointer* p) :
  _mem(p->_mem), _slp(p->_slp), _base(NULL), _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
  _invar_scale(NULL),
  _nstack(p->_nstack), _analyze_only(p->_analyze_only),
  _stack_idx(p->_stack_idx)
#ifndef PRODUCT
  , _tracer(p->_slp)
#endif
{}
3797
is_main_loop_member(Node * n) const3798 bool SWPointer::is_main_loop_member(Node* n) const {
3799 Node* n_c = phase()->get_ctrl(n);
3800 return lpt()->is_member(phase()->get_loop(n_c));
3801 }
3802
invariant(Node * n) const3803 bool SWPointer::invariant(Node* n) const {
3804 NOT_PRODUCT(Tracer::Depth dd;)
3805 Node* n_c = phase()->get_ctrl(n);
3806 NOT_PRODUCT(_tracer.invariant_1(n, n_c);)
3807 bool is_not_member = !is_main_loop_member(n);
3808 if (is_not_member && _slp->lp()->is_main_loop()) {
3809 // Check that n_c dominates the pre loop head node. If it does not, then we cannot use n as invariant for the pre loop
3810 // CountedLoopEndNode check because n_c is either part of the pre loop or between the pre and the main loop (illegal
3811 // invariant: Happens, for example, when n_c is a CastII node that prevents data nodes to flow above the main loop).
3812 return phase()->is_dominator(n_c, _slp->pre_loop_head());
3813 }
3814 return is_not_member;
3815 }
3816
3817 //------------------------scaled_iv_plus_offset--------------------
3818 // Match: k*iv + offset
3819 // where: k is a constant that maybe zero, and
3820 // offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
// Match: k*iv + offset, where k may be zero and
// offset is (k2 [+/- invariant]) with k2 possibly zero.
bool SWPointer::scaled_iv_plus_offset(Node* n) {
  NOT_PRODUCT(Tracer::Depth ddd;)
  NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);)

  // Pure k*iv term.
  if (scaled_iv(n)) {
    NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);)
    return true;
  }

  // Pure offset term (no iv).
  if (offset_plus_k(n)) {
    NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);)
    return true;
  }

  // Otherwise it must be an AddI/SubI combining both forms.
  // NOTE: the matchers above accumulate into _scale/_offset/_invar as
  // side effects, so the short-circuit order of the conjuncts below matters.
  int opc = n->Opcode();
  if (opc == Op_AddI) {
    if (offset_plus_k(n->in(2)) && scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);)
      return true;
    }
    if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);)
      return true;
    }
  } else if (opc == Op_SubI) {
    // k*iv - offset: match the offset with its sign negated.
    if (offset_plus_k(n->in(2), true) && scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);)
      return true;
    }
    // offset - k*iv: the iv term is negated, so flip the scale.
    if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) {
      _scale *= -1;
      NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);)
      return true;
    }
  }

  NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);)
  return false;
}
3860
3861 //----------------------------scaled_iv------------------------
3862 // Match: k*iv where k is a constant that's not zero
// Match: k*iv where k is a non-zero constant. On success sets _scale.
// Also recognizes iv alone (k == 1), k*iv hidden under a ConvI2L (with an
// optional range-check CastII in between), and (k*iv + offset) << shift.
bool SWPointer::scaled_iv(Node* n) {
  NOT_PRODUCT(Tracer::Depth ddd;)
  NOT_PRODUCT(_tracer.scaled_iv_1(n);)

  // Only one iv term is representable in the canonical form.
  if (_scale != 0) { // already found a scale
    NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);)
    return false;
  }

  // The iv by itself counts as 1*iv.
  if (n == iv()) {
    _scale = 1;
    NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);)
    return true;
  }
  // In analyze-only mode, remember loop-variant nodes on the stack.
  if (_analyze_only && (is_main_loop_member(n))) {
    _nstack->push(n, _stack_idx++);
  }

  int opc = n->Opcode();
  if (opc == Op_MulI) {
    // iv * con  or  con * iv
    if (n->in(1) == iv() && n->in(2)->is_Con()) {
      _scale = n->in(2)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);)
      return true;
    } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
      _scale = n->in(1)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);)
      return true;
    }
  } else if (opc == Op_LShiftI) {
    // iv << con  ==  (1 << con) * iv
    if (n->in(1) == iv() && n->in(2)->is_Con()) {
      _scale = 1 << n->in(2)->get_int();
      NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
      return true;
    }
  } else if (opc == Op_ConvI2L) {
    if (n->in(1)->Opcode() == Op_CastII &&
        n->in(1)->as_CastII()->has_range_check()) {
      // Skip range check dependent CastII nodes
      n = n->in(1);
    }
    // Look through the widening conversion.
    if (scaled_iv_plus_offset(n->in(1))) {
      NOT_PRODUCT(_tracer.scaled_iv_7(n);)
      return true;
    }
  } else if (opc == Op_LShiftL && n->in(2)->is_Con()) {
    if (!has_iv() && _invar == NULL) {
      // Need to preserve the current _offset value, so
      // create a temporary object for this expression subtree.
      // Hacky, so should re-engineer the address pattern match.
      NOT_PRODUCT(Tracer::Depth dddd;)
      SWPointer tmp(this);
      NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)

      if (tmp.scaled_iv_plus_offset(n->in(1))) {
        // Fold the shift into the scale and offset found in the subtree.
        int scale = n->in(2)->get_int();
        _scale = tmp._scale << scale;
        _offset += tmp._offset << scale;
        _invar = tmp._invar;
        if (_invar != NULL) {
          _negate_invar = tmp._negate_invar;
          // Remember the shift so the invariant can be rescaled later.
          _invar_scale = n->in(2);
        }
        NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, _invar, _negate_invar);)
        return true;
      }
    }
  }
  NOT_PRODUCT(_tracer.scaled_iv_10(n);)
  return false;
}
3934
3935 //----------------------------offset_plus_k------------------------
3936 // Match: offset is (k [+/- invariant])
3937 // where k maybe zero and invariant is optional, but not both.
offset_plus_k(Node * n,bool negate)3938 bool SWPointer::offset_plus_k(Node* n, bool negate) {
3939 NOT_PRODUCT(Tracer::Depth ddd;)
3940 NOT_PRODUCT(_tracer.offset_plus_k_1(n);)
3941
3942 int opc = n->Opcode();
3943 if (opc == Op_ConI) {
3944 _offset += negate ? -(n->get_int()) : n->get_int();
3945 NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
3946 return true;
3947 } else if (opc == Op_ConL) {
3948 // Okay if value fits into an int
3949 const TypeLong* t = n->find_long_type();
3950 if (t->higher_equal(TypeLong::INT)) {
3951 jlong loff = n->get_long();
3952 jint off = (jint)loff;
3953 _offset += negate ? -off : loff;
3954 NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
3955 return true;
3956 }
3957 NOT_PRODUCT(_tracer.offset_plus_k_4(n);)
3958 return false;
3959 }
3960 if (_invar != NULL) { // already has an invariant
3961 NOT_PRODUCT(_tracer.offset_plus_k_5(n, _invar);)
3962 return false;
3963 }
3964
3965 if (_analyze_only && is_main_loop_member(n)) {
3966 _nstack->push(n, _stack_idx++);
3967 }
3968 if (opc == Op_AddI) {
3969 if (n->in(2)->is_Con() && invariant(n->in(1))) {
3970 _negate_invar = negate;
3971 _invar = n->in(1);
3972 _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
3973 NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, _negate_invar, _offset);)
3974 return true;
3975 } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
3976 _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
3977 _negate_invar = negate;
3978 _invar = n->in(2);
3979 NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, _negate_invar, _offset);)
3980 return true;
3981 }
3982 }
3983 if (opc == Op_SubI) {
3984 if (n->in(2)->is_Con() && invariant(n->in(1))) {
3985 _negate_invar = negate;
3986 _invar = n->in(1);
3987 _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
3988 NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, _negate_invar, _offset);)
3989 return true;
3990 } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
3991 _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
3992 _negate_invar = !negate;
3993 _invar = n->in(2);
3994 NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, _negate_invar, _offset);)
3995 return true;
3996 }
3997 }
3998
3999 if (!is_main_loop_member(n)) {
4000 // 'n' is loop invariant. Skip range check dependent CastII nodes before checking if 'n' is dominating the pre loop.
4001 if (opc == Op_ConvI2L) {
4002 n = n->in(1);
4003 if (n->Opcode() == Op_CastII &&
4004 n->as_CastII()->has_range_check()) {
4005 // Skip range check dependent CastII nodes
4006 assert(!is_main_loop_member(n), "sanity");
4007 n = n->in(1);
4008 }
4009 }
4010 // Check if 'n' can really be used as invariant (not in main loop and dominating the pre loop).
4011 if (invariant(n)) {
4012 _negate_invar = negate;
4013 _invar = n;
4014 NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, _negate_invar, _offset);)
4015 return true;
4016 }
4017 }
4018
4019 NOT_PRODUCT(_tracer.offset_plus_k_11(n);)
4020 return false;
4021 }
4022
4023 //----------------------------print------------------------
print()4024 void SWPointer::print() {
4025 #ifndef PRODUCT
4026 tty->print("base: [%d] adr: [%d] scale: %d offset: %d",
4027 _base != NULL ? _base->_idx : 0,
4028 _adr != NULL ? _adr->_idx : 0,
4029 _scale, _offset);
4030 if (_invar != NULL) {
4031 tty->print(" invar: %c[%d] << [%d]", _negate_invar?'-':'+', _invar->_idx, _invar_scale->_idx);
4032 }
4033 tty->cr();
4034 #endif
4035 }
4036
4037 //----------------------------tracing------------------------
4038 #ifndef PRODUCT
// Indent trace output proportionally to the current match recursion depth.
void SWPointer::Tracer::print_depth() const {
  for (int ii = 0; ii < _depth; ++ii) {
    tty->print(" ");
  }
}
4044
// Trace: start of SWPointer construction for memory node 'mem'.
void SWPointer::Tracer::ctor_1 (Node* mem) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::SWPointer: start alignment analysis", mem->_idx); mem->dump();
  }
}
4050
// Trace: dump the outermost AddP and its base input at increasing indent.
void SWPointer::Tracer::ctor_2(Node* adr) {
  if(_slp->is_trace_alignment()) {
    //store_depth();
    inc_depth();
    print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: ", adr->_idx); adr->dump();
    inc_depth();
    print_depth(); tty->print(" %d (base) SWPointer::SWPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
  }
}
4060
// Trace: dump the Offset input of the AddP inspected at iteration 'i'.
void SWPointer::Tracer::ctor_3(Node* adr, int i) {
  if(_slp->is_trace_alignment()) {
    inc_depth();
    Node* offset = adr->in(AddPNode::Offset);
    print_depth(); tty->print(" %d (offset) SWPointer::SWPointer: i = %d: ", offset->_idx, i); offset->dump();
  }
}
4068
// Trace: dump the Address input the walk advanced to at iteration 'i'.
void SWPointer::Tracer::ctor_4(Node* adr, int i) {
  if(_slp->is_trace_alignment()) {
    inc_depth();
    print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: i = %d: ", adr->_idx, i); adr->dump();
  }
}
4075
// Trace: report why the AddP walk stopped at iteration 'i'
// (reached the base, or the next address is not an AddP).
void SWPointer::Tracer::ctor_5(Node* adr, Node* base, int i) {
  if(_slp->is_trace_alignment()) {
    inc_depth();
    if (base == adr) {
      print_depth(); tty->print_cr(" \\ %d (adr) == %d (base) SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i);
    } else if (!adr->is_AddP()) {
      print_depth(); tty->print_cr(" \\ %d (adr) is NOT Addp SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, i);
    }
  }
}
4086
// Trace: end of a successful SWPointer construction.
void SWPointer::Tracer::ctor_6(Node* mem) {
  if(_slp->is_trace_alignment()) {
    //restore_depth();
    print_depth(); tty->print_cr(" %d (adr) SWPointer::SWPointer: stop analysis", mem->_idx);
  }
}
4093
// Debug aid: report nodes where loop-tree membership and basic-block
// membership disagree (only under do_vector_loop + debug tracing).
void SWPointer::Tracer::invariant_1(Node *n, Node *n_c) const {
  if (_slp->do_vector_loop() && _slp->is_debug() && _slp->_lpt->is_member(_slp->_phase->get_loop(n_c)) != (int)_slp->in_bb(n)) {
    int is_member = _slp->_lpt->is_member(_slp->_phase->get_loop(n_c));
    int in_bb = _slp->in_bb(n);
    print_depth(); tty->print(" \\ "); tty->print_cr(" %d SWPointer::invariant conditions differ: n_c %d", n->_idx, n_c->_idx);
    print_depth(); tty->print(" \\ "); tty->print_cr("is_member %d, in_bb %d", is_member, in_bb);
    print_depth(); tty->print(" \\ "); n->dump();
    print_depth(); tty->print(" \\ "); n_c->dump();
  }
}
4104
// Trace: node being tested by scaled_iv_plus_offset.
void SWPointer::Tracer::scaled_iv_plus_offset_1(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset testing node: ", n->_idx);
    n->dump();
  }
}
4111
// Trace: matched as a pure scaled-iv term.
void SWPointer::Tracer::scaled_iv_plus_offset_2(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
  }
}
4117
// Trace: matched as a pure offset term.
void SWPointer::Tracer::scaled_iv_plus_offset_3(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
  }
}
4123
// Trace: AddI matched as scaled_iv(in(1)) + offset(in(2)).
void SWPointer::Tracer::scaled_iv_plus_offset_4(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
  }
}
4131
// Trace: AddI matched as offset(in(1)) + scaled_iv(in(2)).
void SWPointer::Tracer::scaled_iv_plus_offset_5(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
  }
}
4139
// Trace: SubI matched as scaled_iv(in(1)) - offset(in(2)).
void SWPointer::Tracer::scaled_iv_plus_offset_6(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
  }
}
4147
// Trace: SubI matched as offset(in(1)) - scaled_iv(in(2)) (scale flipped).
void SWPointer::Tracer::scaled_iv_plus_offset_7(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
  }
}
4155
// Trace: no scaled_iv_plus_offset pattern matched.
void SWPointer::Tracer::scaled_iv_plus_offset_8(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: FAILED", n->_idx);
  }
}
4161
// Trace: node being tested by scaled_iv.
void SWPointer::Tracer::scaled_iv_1(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::scaled_iv: testing node: ", n->_idx); n->dump();
  }
}
4167
// Trace: rejected because a scale was already found earlier.
void SWPointer::Tracer::scaled_iv_2(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx);
    print_depth(); tty->print_cr(" \\ SWPointer::scaled_iv: _scale (%d) != 0", scale);
  }
}
4174
// Trace: node is the iv itself (scale becomes 1).
void SWPointer::Tracer::scaled_iv_3(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale);
  }
}
4180
// Trace: MulI matched as iv * con.
void SWPointer::Tracer::scaled_iv_4(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
  }
}
4188
// Trace: MulI matched as con * iv.
void SWPointer::Tracer::scaled_iv_5(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(2) is iv: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
  }
}
4196
// Trace: LShiftI matched as iv << con.
void SWPointer::Tracer::scaled_iv_6(Node* n, int scale) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale);
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
  }
}
4204
// Trace: ConvI2L matched by recursing into its input.
void SWPointer::Tracer::scaled_iv_7(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx);
    print_depth(); tty->print_cr(" \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(1)->_idx);
    inc_depth(); inc_depth();
    print_depth(); n->in(1)->dump();
    dec_depth(); dec_depth();
  }
}
4214
// Trace: LShiftL path created a temporary SWPointer for the subtree match.
void SWPointer::Tracer::scaled_iv_8(Node* n, SWPointer* tmp) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::scaled_iv: Op_LShiftL, creating tmp SWPointer: ", n->_idx); tmp->print();
  }
}
4220
// Trace: LShiftL match succeeded; show the folded scale/offset/invariant.
void SWPointer::Tracer::scaled_iv_9(Node* n, int scale, int offset, Node* invar, bool negate_invar) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, offset);
    print_depth(); tty->print_cr(" \\ SWPointer::scaled_iv: in(1) [%d] is scaled_iv_plus_offset, in(2) [%d] used to scale: _scale = %d, _offset = %d",
                                 n->in(1)->_idx, n->in(2)->_idx, scale, offset);
    if (invar != NULL) {
      print_depth(); tty->print_cr(" \\ SWPointer::scaled_iv: scaled invariant: %c[%d]", (negate_invar?'-':'+'), invar->_idx);
    }
    inc_depth(); inc_depth();
    print_depth(); n->in(1)->dump();
    print_depth(); n->in(2)->dump();
    if (invar != NULL) {
      print_depth(); invar->dump();
    }
    dec_depth(); dec_depth();
  }
}
4238
// Trace: no scaled_iv pattern matched.
void SWPointer::Tracer::scaled_iv_10(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED", n->_idx);
  }
}
4244
// Trace: node being tested by offset_plus_k.
void SWPointer::Tracer::offset_plus_k_1(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::offset_plus_k: testing node: ", n->_idx); n->dump();
  }
}
4250
// Trace: ConI constant folded into _offset.
void SWPointer::Tracer::offset_plus_k_2(Node* n, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset);
  }
}
4256
// Trace: ConL constant (fits in int) folded into _offset.
void SWPointer::Tracer::offset_plus_k_3(Node* n, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset);
  }
}
4262
// Trace: ConL constant rejected because it does not fit in an int.
void SWPointer::Tracer::offset_plus_k_4(Node* n) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
    print_depth(); tty->print_cr(" \\ " JLONG_FORMAT " SWPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long());
  }
}
4269
// Trace: rejected because an invariant was already recorded.
void SWPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx);
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: _invar != NULL: ", _invar->_idx); _invar->dump();
  }
}
4276
// Trace: AddI matched as invariant(in(1)) + con(in(2)).
void SWPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
                                 n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
  }
}
4285
// Trace: AddI matched as con(in(1)) + invariant(in(2)).
void SWPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
                                 n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
  }
}
4294
// Trace: SubI matched as invariant(in(1)) - con(in(2)).
void SWPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI is PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
                                 n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
  }
}
4303
// Trace: SubI matched as con(in(1)) - invariant(in(2)) (invariant sign flipped).
void SWPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) {
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
    print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
  }
}
4311
offset_plus_k_10(Node * n,Node * _invar,bool _negate_invar,int _offset)4312 void SWPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) {
4313 if(_slp->is_trace_alignment()) {
4314 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
4315 print_depth(); tty->print_cr(" \\ %d SWPointer::offset_plus_k: is invariant", n->_idx);
4316 }
4317 }
4318
offset_plus_k_11(Node * n)4319 void SWPointer::Tracer::offset_plus_k_11(Node* n) {
4320 if(_slp->is_trace_alignment()) {
4321 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
4322 }
4323 }
4324
4325 #endif
4326 // ========================= OrderedPair =====================
4327
const OrderedPair OrderedPair::initial; // statically default-constructed "initial" instance
4329
4330 // ========================= SWNodeInfo =====================
4331
const SWNodeInfo SWNodeInfo::initial; // statically default-constructed "initial" instance
4333
4334
4335 // ============================ DepGraph ===========================
4336
4337 //------------------------------make_node---------------------------
4338 // Make a new dependence graph node for an ideal node.
make_node(Node * node)4339 DepMem* DepGraph::make_node(Node* node) {
4340 DepMem* m = new (_arena) DepMem(node);
4341 if (node != NULL) {
4342 assert(_map.at_grow(node->_idx) == NULL, "one init only");
4343 _map.at_put_grow(node->_idx, m);
4344 }
4345 return m;
4346 }
4347
4348 //------------------------------make_edge---------------------------
4349 // Make a new dependence graph edge from dpred -> dsucc
make_edge(DepMem * dpred,DepMem * dsucc)4350 DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) {
4351 DepEdge* e = new (_arena) DepEdge(dpred, dsucc, dsucc->in_head(), dpred->out_head());
4352 dpred->set_out_head(e);
4353 dsucc->set_in_head(e);
4354 return e;
4355 }
4356
4357 // ========================== DepMem ========================
4358
4359 //------------------------------in_cnt---------------------------
in_cnt()4360 int DepMem::in_cnt() {
4361 int ct = 0;
4362 for (DepEdge* e = _in_head; e != NULL; e = e->next_in()) ct++;
4363 return ct;
4364 }
4365
4366 //------------------------------out_cnt---------------------------
out_cnt()4367 int DepMem::out_cnt() {
4368 int ct = 0;
4369 for (DepEdge* e = _out_head; e != NULL; e = e->next_out()) ct++;
4370 return ct;
4371 }
4372
4373 //------------------------------print-----------------------------
print()4374 void DepMem::print() {
4375 #ifndef PRODUCT
4376 tty->print(" DepNode %d (", _node->_idx);
4377 for (DepEdge* p = _in_head; p != NULL; p = p->next_in()) {
4378 Node* pred = p->pred()->node();
4379 tty->print(" %d", pred != NULL ? pred->_idx : 0);
4380 }
4381 tty->print(") [");
4382 for (DepEdge* s = _out_head; s != NULL; s = s->next_out()) {
4383 Node* succ = s->succ()->node();
4384 tty->print(" %d", succ != NULL ? succ->_idx : 0);
4385 }
4386 tty->print_cr(" ]");
4387 #endif
4388 }
4389
4390 // =========================== DepEdge =========================
4391
//------------------------------print-----------------------------
// Print this edge as "DepEdge: pred_idx [ succ_idx ]" (debug builds only).
void DepEdge::print() {
#ifndef PRODUCT
  tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx);
#endif
}
4398
4399 // =========================== DepPreds =========================
4400 // Iterator over predecessor edges in the dependence graph.
4401
4402 //------------------------------DepPreds---------------------------
// Construct an iterator over the predecessors of n: ideal-graph inputs
// plus (for memory nodes) incoming dependence-graph edges.
DepPreds::DepPreds(Node* n, DepGraph& dg) {
  _n = n;
  _done = false;
  if (_n->is_Store() || _n->is_Load()) {
    // Loads/Stores: iterate inputs starting at the address edge, plus
    // the dependence-graph predecessor edges.
    _next_idx = MemNode::Address;
    _end_idx = n->req();
    _dep_next = dg.dep(_n)->in_head();
  } else if (_n->is_Mem()) {
    // Other memory nodes: only the dependence-graph edges.
    _next_idx = 0;
    _end_idx = 0;
    _dep_next = dg.dep(_n)->in_head();
  } else {
    // Non-memory nodes: ordinary ideal-graph inputs, skipping in(0) (control).
    _next_idx = 1;
    _end_idx = _n->req();
    _dep_next = NULL;
  }
  next(); // position on the first predecessor, or set _done
}
4421
4422 //------------------------------next---------------------------
// Advance to the next predecessor: dependence-graph edges are consumed
// first, then the ideal-graph inputs; _done is set when both are exhausted.
void DepPreds::next() {
  if (_dep_next != NULL) {
    _current  = _dep_next->pred()->node();
    _dep_next = _dep_next->next_in();
  } else if (_next_idx < _end_idx) {
    _current  = _n->in(_next_idx++);
  } else {
    _done = true;
  }
}
4433
4434 // =========================== DepSuccs =========================
4435 // Iterator over successor edges in the dependence graph.
4436
4437 //------------------------------DepSuccs---------------------------
// Construct an iterator over the successors of n: ideal-graph outputs
// plus (for memory nodes) outgoing dependence-graph edges.
DepSuccs::DepSuccs(Node* n, DepGraph& dg) {
  _n = n;
  _done = false;
  if (_n->is_Load()) {
    // Loads: iterate both the ideal-graph outputs and the
    // dependence-graph successor edges.
    _next_idx = 0;
    _end_idx = _n->outcnt();
    _dep_next = dg.dep(_n)->out_head();
  } else if (_n->is_Mem() || (_n->is_Phi() && _n->bottom_type() == Type::MEMORY)) {
    // Other memory nodes and memory phis: only the dependence-graph edges.
    _next_idx = 0;
    _end_idx = 0;
    _dep_next = dg.dep(_n)->out_head();
  } else {
    // Non-memory nodes: ordinary ideal-graph outputs.
    _next_idx = 0;
    _end_idx = _n->outcnt();
    _dep_next = NULL;
  }
  next(); // position on the first successor, or set _done
}
4456
4457 //-------------------------------next---------------------------
// Advance to the next successor: dependence-graph edges are consumed
// first, then the ideal-graph outputs; _done is set when both are exhausted.
void DepSuccs::next() {
  if (_dep_next != NULL) {
    _current  = _dep_next->succ()->node();
    _dep_next = _dep_next->next_out();
  } else if (_next_idx < _end_idx) {
    _current  = _n->raw_out(_next_idx++);
  } else {
    _done = true;
  }
}
4468
4469 //
4470 // --------------------------------- vectorization/simd -----------------------------------
4471 //
same_origin_idx(Node * a,Node * b) const4472 bool SuperWord::same_origin_idx(Node* a, Node* b) const {
4473 return a != NULL && b != NULL && _clone_map.same_idx(a->_idx, b->_idx);
4474 }
same_generation(Node * a,Node * b) const4475 bool SuperWord::same_generation(Node* a, Node* b) const {
4476 return a != NULL && b != NULL && _clone_map.same_gen(a->_idx, b->_idx);
4477 }
4478
find_phi_for_mem_dep(LoadNode * ld)4479 Node* SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
4480 assert(in_bb(ld), "must be in block");
4481 if (_clone_map.gen(ld->_idx) == _ii_first) {
4482 #ifndef PRODUCT
4483 if (_vector_loop_debug) {
4484 tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
4485 _clone_map.gen(ld->_idx));
4486 }
4487 #endif
4488 return NULL; //we think that any ld in the first gen being vectorizable
4489 }
4490
4491 Node* mem = ld->in(MemNode::Memory);
4492 if (mem->outcnt() <= 1) {
4493 // we don't want to remove the only edge from mem node to load
4494 #ifndef PRODUCT
4495 if (_vector_loop_debug) {
4496 tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
4497 mem->_idx, ld->_idx);
4498 ld->dump();
4499 mem->dump();
4500 }
4501 #endif
4502 return NULL;
4503 }
4504 if (!in_bb(mem) || same_generation(mem, ld)) {
4505 #ifndef PRODUCT
4506 if (_vector_loop_debug) {
4507 tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
4508 _clone_map.gen(mem->_idx));
4509 }
4510 #endif
4511 return NULL; // does not depend on loop volatile node or depends on the same generation
4512 }
4513
4514 //otherwise first node should depend on mem-phi
4515 Node* first = first_node(ld);
4516 assert(first->is_Load(), "must be Load");
4517 Node* phi = first->as_Load()->in(MemNode::Memory);
4518 if (!phi->is_Phi() || phi->bottom_type() != Type::MEMORY) {
4519 #ifndef PRODUCT
4520 if (_vector_loop_debug) {
4521 tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi");
4522 ld->dump();
4523 first->dump();
4524 }
4525 #endif
4526 return NULL;
4527 }
4528
4529 Node* tail = 0;
4530 for (int m = 0; m < _mem_slice_head.length(); m++) {
4531 if (_mem_slice_head.at(m) == phi) {
4532 tail = _mem_slice_tail.at(m);
4533 }
4534 }
4535 if (tail == 0) { //test that found phi is in the list _mem_slice_head
4536 #ifndef PRODUCT
4537 if (_vector_loop_debug) {
4538 tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
4539 ld->_idx, phi->_idx);
4540 ld->dump();
4541 phi->dump();
4542 }
4543 #endif
4544 return NULL;
4545 }
4546
4547 // now all conditions are met
4548 return phi;
4549 }
4550
first_node(Node * nd)4551 Node* SuperWord::first_node(Node* nd) {
4552 for (int ii = 0; ii < _iteration_first.length(); ii++) {
4553 Node* nnn = _iteration_first.at(ii);
4554 if (same_origin_idx(nnn, nd)) {
4555 #ifndef PRODUCT
4556 if (_vector_loop_debug) {
4557 tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
4558 nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
4559 }
4560 #endif
4561 return nnn;
4562 }
4563 }
4564
4565 #ifndef PRODUCT
4566 if (_vector_loop_debug) {
4567 tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
4568 nd->_idx, _clone_map.idx(nd->_idx));
4569 }
4570 #endif
4571 return 0;
4572 }
4573
last_node(Node * nd)4574 Node* SuperWord::last_node(Node* nd) {
4575 for (int ii = 0; ii < _iteration_last.length(); ii++) {
4576 Node* nnn = _iteration_last.at(ii);
4577 if (same_origin_idx(nnn, nd)) {
4578 #ifndef PRODUCT
4579 if (_vector_loop_debug) {
4580 tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
4581 _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
4582 }
4583 #endif
4584 return nnn;
4585 }
4586 }
4587 return 0;
4588 }
4589
mark_generations()4590 int SuperWord::mark_generations() {
4591 Node *ii_err = NULL, *tail_err = NULL;
4592 for (int i = 0; i < _mem_slice_head.length(); i++) {
4593 Node* phi = _mem_slice_head.at(i);
4594 assert(phi->is_Phi(), "must be phi");
4595
4596 Node* tail = _mem_slice_tail.at(i);
4597 if (_ii_last == -1) {
4598 tail_err = tail;
4599 _ii_last = _clone_map.gen(tail->_idx);
4600 }
4601 else if (_ii_last != _clone_map.gen(tail->_idx)) {
4602 #ifndef PRODUCT
4603 if (TraceSuperWord && Verbose) {
4604 tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes ");
4605 tail->dump();
4606 tail_err->dump();
4607 }
4608 #endif
4609 return -1;
4610 }
4611
4612 // find first iteration in the loop
4613 for (DUIterator_Fast imax, i = phi->fast_outs(imax); i < imax; i++) {
4614 Node* ii = phi->fast_out(i);
4615 if (in_bb(ii) && ii->is_Store()) { // we speculate that normally Stores of one and one only generation have deps from mem phi
4616 if (_ii_first == -1) {
4617 ii_err = ii;
4618 _ii_first = _clone_map.gen(ii->_idx);
4619 } else if (_ii_first != _clone_map.gen(ii->_idx)) {
4620 #ifndef PRODUCT
4621 if (TraceSuperWord && Verbose) {
4622 tty->print_cr("SuperWord::mark_generations: _ii_first was found before and not equal to one in this node (%d)", _ii_first);
4623 ii->dump();
4624 if (ii_err!= 0) {
4625 ii_err->dump();
4626 }
4627 }
4628 #endif
4629 return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
4630 }
4631 }
4632 }//for (DUIterator_Fast imax,
4633 }//for (int i...
4634
4635 if (_ii_first == -1 || _ii_last == -1) {
4636 if (TraceSuperWord && Verbose) {
4637 tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong");
4638 }
4639 return -1; // something vent wrong
4640 }
4641 // collect nodes in the first and last generations
4642 assert(_iteration_first.length() == 0, "_iteration_first must be empty");
4643 assert(_iteration_last.length() == 0, "_iteration_last must be empty");
4644 for (int j = 0; j < _block.length(); j++) {
4645 Node* n = _block.at(j);
4646 node_idx_t gen = _clone_map.gen(n->_idx);
4647 if ((signed)gen == _ii_first) {
4648 _iteration_first.push(n);
4649 } else if ((signed)gen == _ii_last) {
4650 _iteration_last.push(n);
4651 }
4652 }
4653
4654 // building order of iterations
4655 if (_ii_order.length() == 0 && ii_err != 0) {
4656 assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
4657 Node* nd = ii_err;
4658 while(_clone_map.gen(nd->_idx) != _ii_last) {
4659 _ii_order.push(_clone_map.gen(nd->_idx));
4660 bool found = false;
4661 for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
4662 Node* use = nd->fast_out(i);
4663 if (same_origin_idx(use, nd) && use->as_Store()->in(MemNode::Memory) == nd) {
4664 found = true;
4665 nd = use;
4666 break;
4667 }
4668 }//for
4669
4670 if (found == false) {
4671 if (TraceSuperWord && Verbose) {
4672 tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx);
4673 }
4674 _ii_order.clear();
4675 return -1;
4676 }
4677 } //while
4678 _ii_order.push(_clone_map.gen(nd->_idx));
4679 }
4680
4681 #ifndef PRODUCT
4682 if (_vector_loop_debug) {
4683 tty->print_cr("SuperWord::mark_generations");
4684 tty->print_cr("First generation (%d) nodes:", _ii_first);
4685 for (int ii = 0; ii < _iteration_first.length(); ii++) _iteration_first.at(ii)->dump();
4686 tty->print_cr("Last generation (%d) nodes:", _ii_last);
4687 for (int ii = 0; ii < _iteration_last.length(); ii++) _iteration_last.at(ii)->dump();
4688 tty->print_cr(" ");
4689
4690 tty->print("SuperWord::List of generations: ");
4691 for (int jj = 0; jj < _ii_order.length(); ++jj) {
4692 tty->print("%d:%d ", jj, _ii_order.at(jj));
4693 }
4694 tty->print_cr(" ");
4695 }
4696 #endif
4697
4698 return _ii_first;
4699 }
4700
fix_commutative_inputs(Node * gold,Node * fix)4701 bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
4702 assert(gold->is_Add() && fix->is_Add() || gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes");
4703 assert(same_origin_idx(gold, fix), "should be clones of the same node");
4704 Node* gin1 = gold->in(1);
4705 Node* gin2 = gold->in(2);
4706 Node* fin1 = fix->in(1);
4707 Node* fin2 = fix->in(2);
4708 bool swapped = false;
4709
4710 if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin1)) {
4711 if (same_origin_idx(gin1, fin1) &&
4712 same_origin_idx(gin2, fin2)) {
4713 return true; // nothing to fix
4714 }
4715 if (same_origin_idx(gin1, fin2) &&
4716 same_origin_idx(gin2, fin1)) {
4717 fix->swap_edges(1, 2);
4718 swapped = true;
4719 }
4720 }
4721 // at least one input comes from outside of bb
4722 if (gin1->_idx == fin1->_idx) {
4723 return true; // nothing to fix
4724 }
4725 if (!swapped && (gin1->_idx == fin2->_idx || gin2->_idx == fin1->_idx)) { //swapping is expensive, check condition first
4726 fix->swap_edges(1, 2);
4727 swapped = true;
4728 }
4729
4730 if (swapped) {
4731 #ifndef PRODUCT
4732 if (_vector_loop_debug) {
4733 tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx);
4734 }
4735 #endif
4736 return true;
4737 }
4738
4739 if (TraceSuperWord && Verbose) {
4740 tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx);
4741 }
4742
4743 return false;
4744 }
4745
// Build _packset for the parallel-iterations vectorization scheme: for every
// candidate node of the first unroll generation, collect its clones from the
// following generations (in _ii_order order) into packs of four.
// Returns false when no generation order is recorded, true otherwise.
bool SuperWord::pack_parallel() {
#ifndef PRODUCT
  if (_vector_loop_debug) {
    tty->print_cr("SuperWord::pack_parallel: START");
  }
#endif

  _packset.clear();

  if (_ii_order.is_empty()) {
#ifndef PRODUCT
    if (_vector_loop_debug) {
      tty->print_cr("SuperWord::pack_parallel: EMPTY");
    }
#endif
    return false; // no recorded order of generations to follow
  }

  for (int ii = 0; ii < _iteration_first.length(); ii++) {
    Node* nd = _iteration_first.at(ii);
    // Only loads, stores, adds and muls are packed by this scheme.
    if (in_bb(nd) && (nd->is_Load() || nd->is_Store() || nd->is_Add() || nd->is_Mul())) {
      Node_List* pk = new Node_List();
      pk->push(nd);
      for (int gen = 1; gen < _ii_order.length(); ++gen) {
        // Find the clone of nd that belongs to generation _ii_order.at(gen).
        for (int kk = 0; kk < _block.length(); kk++) {
          Node* clone = _block.at(kk);
          if (same_origin_idx(clone, nd) &&
              _clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
            if (nd->is_Add() || nd->is_Mul()) {
              // Align the clone's commutative operand order with nd's.
              fix_commutative_inputs(nd, clone);
            }
            pk->push(clone);
            if (pk->size() == 4) {
              _packset.append(pk);
#ifndef PRODUCT
              if (_vector_loop_debug) {
                tty->print_cr("SuperWord::pack_parallel: added pack ");
                pk->dump();
              }
#endif
              // Start a fresh pack unless this clone was the last generation.
              if (_clone_map.gen(clone->_idx) != _ii_last) {
                pk = new Node_List();
              }
            }
            break;
          }
        }
      }//for
    }//if
  }//for

#ifndef PRODUCT
  if (_vector_loop_debug) {
    tty->print_cr("SuperWord::pack_parallel: END");
  }
#endif

  return true;
}
4805
hoist_loads_in_graph()4806 bool SuperWord::hoist_loads_in_graph() {
4807 GrowableArray<Node*> loads;
4808
4809 #ifndef PRODUCT
4810 if (_vector_loop_debug) {
4811 tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length());
4812 }
4813 #endif
4814
4815 for (int i = 0; i < _mem_slice_head.length(); i++) {
4816 Node* n = _mem_slice_head.at(i);
4817 if ( !in_bb(n) || !n->is_Phi() || n->bottom_type() != Type::MEMORY) {
4818 if (TraceSuperWord && Verbose) {
4819 tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx);
4820 }
4821 continue;
4822 }
4823
4824 #ifndef PRODUCT
4825 if (_vector_loop_debug) {
4826 tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d = _mem_slice_head.at(%d);", n->_idx, i);
4827 }
4828 #endif
4829
4830 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
4831 Node* ld = n->fast_out(i);
4832 if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
4833 for (int i = 0; i < _block.length(); i++) {
4834 Node* ld2 = _block.at(i);
4835 if (ld2->is_Load() && same_origin_idx(ld, ld2) &&
4836 !same_generation(ld, ld2)) { // <= do not collect the first generation ld
4837 #ifndef PRODUCT
4838 if (_vector_loop_debug) {
4839 tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
4840 ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
4841 }
4842 #endif
4843 // could not do on-the-fly, since iterator is immutable
4844 loads.push(ld2);
4845 }
4846 }// for
4847 }//if
4848 }//for (DUIterator_Fast imax,
4849 }//for (int i = 0; i
4850
4851 for (int i = 0; i < loads.length(); i++) {
4852 LoadNode* ld = loads.at(i)->as_Load();
4853 Node* phi = find_phi_for_mem_dep(ld);
4854 if (phi != NULL) {
4855 #ifndef PRODUCT
4856 if (_vector_loop_debug) {
4857 tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
4858 MemNode::Memory, ld->_idx, phi->_idx);
4859 }
4860 #endif
4861 _igvn.replace_input_of(ld, MemNode::Memory, phi);
4862 }
4863 }//for
4864
4865 restart(); // invalidate all basic structures, since we rebuilt the graph
4866
4867 if (TraceSuperWord && Verbose) {
4868 tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild");
4869 }
4870
4871 return true;
4872 }
4873