1 // Copyright Contributors to the Open Shading Language project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
4 
5 #include <vector>
6 #include <cstdio>
7 #include <cmath>
8 
9 #include <OpenImageIO/sysutil.h>
10 #include <OpenImageIO/timer.h>
11 #include <OpenImageIO/thread.h>
12 
13 #include "oslexec_pvt.h"
14 #include "runtimeoptimize.h"
15 #include "../liboslcomp/oslcomp_pvt.h"
16 using namespace OSL;
17 using namespace OSL::pvt;
18 
19 
20 // names of ops we'll be using frequently
21 static ustring u_nop    ("nop"),
22                u_exit   ("exit"),
23                u_assign ("assign"),
24                u_add    ("add"),
25                u_sub    ("sub"),
26                u_mul    ("mul"),
27                u_if     ("if"),
28                u_for    ("for"),
29                u_while  ("while"),
30                u_dowhile("dowhile"),
31                u_functioncall ("functioncall"),
32                u_functioncall_nr("functioncall_nr"),
33                u_break ("break"),
34                u_continue ("continue"),
35                u_return ("return"),
36                u_useparam ("useparam"),
37                u_closure ("closure"),
38                u_pointcloud_write ("pointcloud_write"),
39                u_isconnected ("isconnected"),
40                u_setmessage ("setmessage"),
41                u_getmessage ("getmessage"),
42                u_getattribute ("getattribute");
43 
44 
45 OSL_NAMESPACE_ENTER
46 
47 namespace pvt {   // OSL::pvt
48 
49 using OIIO::spin_lock;
50 using OIIO::Timer;
51 
52 DECLFOLDER(constfold_assign);  // forward decl
53 
54 
55 
/// Wrapper that erases elements of c for which predicate p is true.
/// (Unlike a bare std::remove_if, it also shrinks the container, so that
/// it retains ONLY the elements for which the predicate is false.)
template<class Container, class Predicate>
void erase_if (Container &c, const Predicate &p)
{
    // Classic erase-remove idiom: remove_if partitions the survivors to
    // the front, erase trims the leftover tail.
    c.erase (std::remove_if (c.begin(), c.end(), p), c.end());
}
64 
65 
66 
OSOProcessorBase(ShadingSystemImpl & shadingsys,ShaderGroup & group,ShadingContext * ctx)67 OSOProcessorBase::OSOProcessorBase (ShadingSystemImpl &shadingsys,
68                                     ShaderGroup &group, ShadingContext *ctx)
69     : m_shadingsys(shadingsys),
70       m_group(group),
71       m_context(ctx),
72       m_debug(shadingsys.debug()),
73       m_inst(NULL)
74 {
75     set_debug ();
76 }
77 
78 
79 
~OSOProcessorBase()80 OSOProcessorBase::~OSOProcessorBase ()
81 {
82 }
83 
84 
85 
RuntimeOptimizer(ShadingSystemImpl & shadingsys,ShaderGroup & group,ShadingContext * ctx)86 RuntimeOptimizer::RuntimeOptimizer (ShadingSystemImpl &shadingsys,
87                                     ShaderGroup &group, ShadingContext *ctx)
88     : OSOProcessorBase(shadingsys, group, ctx),
89       m_optimize(shadingsys.optimize()),
90       m_opt_simplify_param(shadingsys.m_opt_simplify_param),
91       m_opt_constant_fold(shadingsys.m_opt_constant_fold),
92       m_opt_stale_assign(shadingsys.m_opt_stale_assign),
93       m_opt_elide_useless_ops(shadingsys.m_opt_elide_useless_ops),
94       m_opt_elide_unconnected_outputs(shadingsys.m_opt_elide_unconnected_outputs),
95       m_opt_peephole(shadingsys.m_opt_peephole),
96       m_opt_coalesce_temps(shadingsys.m_opt_coalesce_temps),
97       m_opt_assign(shadingsys.m_opt_assign),
98       m_opt_mix(shadingsys.m_opt_mix),
99       m_opt_middleman(shadingsys.m_opt_middleman),
100       m_keep_no_return_function_calls(shadingsys.m_llvm_debugging_symbols),
101       m_pass(0),
102       m_next_newconst(0), m_next_newtemp(0),
103       m_stat_opt_locking_time(0), m_stat_specialization_time(0),
104       m_stop_optimizing(false),
105       m_raytypes_on(group.raytypes_on()), m_raytypes_off(group.raytypes_off())
106 {
107     memset ((char *)&m_shaderglobals, 0, sizeof(ShaderGlobals));
108     m_shaderglobals.context = shadingcontext();
109 
110     // Disable no_function_return_calls for OptiX renderers, because we
111     // aren't yet set up to support use of debugging symbols for PTX.
112     // FIXME: some day, we are going to want debugging symbols for PTX, and
113     // will need some refactoring of the debugging symbol code.
114     if (shadingsys.renderer()->supports("OptiX"))
115         m_keep_no_return_function_calls = false;
116 }
117 
118 
119 
~RuntimeOptimizer()120 RuntimeOptimizer::~RuntimeOptimizer ()
121 {
122 }
123 
124 
125 
126 void
set_inst(int newlayer)127 OSOProcessorBase::set_inst (int newlayer)
128 {
129     m_layer = newlayer;
130     m_inst = group()[m_layer];
131     OSL_DASSERT (m_inst != NULL);
132     set_debug ();
133 }
134 
135 
136 
137 void
set_inst(int newlayer)138 RuntimeOptimizer::set_inst (int newlayer)
139 {
140     OSOProcessorBase::set_inst (newlayer);
141     m_all_consts.clear ();
142     m_symbol_aliases.clear ();
143     m_block_aliases.clear ();
144     m_param_aliases.clear ();
145     m_bblockids.clear ();
146 }
147 
148 
149 
150 void
set_debug()151 OSOProcessorBase::set_debug ()
152 {
153     // start with the shading system's idea of debugging level
154     m_debug = shadingsys().debug();
155 
156     // If either group or layer was specified for debug, surely they want
157     // debugging turned on.
158     if (!shadingsys().debug_groupname().empty() || !shadingsys().debug_layername().empty())
159         m_debug = std::max (m_debug, 1);
160 
161     // Force debugging off if a specific group was selected for debug
162     // and we're not it, or a specific layer was selected for debug and
163     // we're not it.
164     bool wronggroup = (!shadingsys().debug_groupname().empty() &&
165                        shadingsys().debug_groupname() != group().name());
166     bool wronglayer = (!shadingsys().debug_layername().empty() && inst() &&
167                        shadingsys().debug_layername() != inst()->layername());
168     if (wronggroup || wronglayer)
169         m_debug = 0;
170 }
171 
172 
173 
174 void
set_debug()175 RuntimeOptimizer::set_debug ()
176 {
177     OSOProcessorBase::set_debug ();
178 
179     // If a specific group is isolated for debugging and  the
180     // 'optimize_dondebug' flag is on, fully optimize all other groups.
181     if (!shadingsys().debug_groupname().empty() &&
182         shadingsys().debug_groupname() != group().name()) {
183         if (shadingsys().m_optimize_nondebug) {
184             // Debugging trick: if user said to only debug one group, turn
185             // on full optimization for all others!  This prevents
186             // everything from running 10x slower just because you want to
187             // debug one shader.
188             m_optimize = 3;
189             m_opt_simplify_param = true;
190             m_opt_constant_fold = true;
191             m_opt_stale_assign = true;
192             m_opt_elide_useless_ops = true;
193             m_opt_elide_unconnected_outputs = true;
194             m_opt_peephole = true;
195             m_opt_coalesce_temps = true;
196             m_opt_assign = true;
197             m_opt_mix = true;
198             m_opt_middleman = true;
199         }
200     }
201 }
202 
203 
204 
205 int
find_constant(const TypeSpec & type,const void * data)206 RuntimeOptimizer::find_constant (const TypeSpec &type, const void *data)
207 {
208     for (int c : m_all_consts) {
209         const Symbol &s (*inst()->symbol(c));
210         OSL_DASSERT (s.symtype() == SymTypeConst);
211         if (equivalent (s.typespec(), type) &&
212               !memcmp (s.data(), data, s.typespec().simpletype().size())) {
213             return c;
214         }
215     }
216     return -1;
217 }
218 
219 
220 
// Return the index of a constant symbol of the given 'type' holding the
// value at 'data', whose in-memory layout is described by 'datatype'
// (TypeDesc::UNKNOWN means "same as type").  An existing matching
// constant is reused; otherwise a new "$newconst" symbol is created.
// Supports int->float conversion and replication of a single scalar
// across all components of an aggregate.
int
RuntimeOptimizer::add_constant (const TypeSpec &type, const void *data,
                                TypeDesc datatype)
{
    int ind = find_constant (type, data);
    if (ind < 0) {
        // support varlen arrays
        TypeSpec newtype = type;
        if (type.is_unsized_array())
            newtype.make_array (datatype.numelements());

        Symbol newconst (ustring::sprintf ("$newconst%d", m_next_newconst++),
                         newtype, SymTypeConst);
        void *newdata = nullptr;
        TypeDesc t (newtype.simpletype());
        size_t n = t.aggregate * t.numelements();   // total component count
        if (datatype == TypeDesc::UNKNOWN)
            datatype = t;
        size_t datan = datatype.aggregate * datatype.numelements();
        if (t.basetype == TypeDesc::INT &&
                datatype.basetype == TypeDesc::INT && n == datan) {
            // int from int of matching size: straight byte copy
            newdata = inst()->shadingsys().alloc_int_constants (n);
            memcpy (newdata, data, t.size());
        } else if (t.basetype == TypeDesc::FLOAT &&
                   datatype.basetype == TypeDesc::FLOAT) {
            // float from float: component copy, or replicate one scalar
            newdata = inst()->shadingsys().alloc_float_constants (n);
            if (n == datan)
                for (size_t i = 0;  i < n;  ++i)
                    ((float *)newdata)[i] = ((const float *)data)[i];
            else if (datan == 1)
                for (size_t i = 0;  i < n;  ++i)
                    ((float *)newdata)[i] = ((const float *)data)[0];
            else {
                OSL_ASSERT (0 && "unsupported type for add_constant");
            }
        } else if (t.basetype == TypeDesc::FLOAT &&
                   datatype.basetype == TypeDesc::INT) {
            // float from int: convert each, or replicate one scalar int
            newdata = inst()->shadingsys().alloc_float_constants (n);
            if (n == datan)
                for (size_t i = 0;  i < n;  ++i)
                    ((float *)newdata)[i] = ((const int *)data)[i];
            else if (datan == 1)
                for (size_t i = 0;  i < n;  ++i)
                    ((float *)newdata)[i] = ((const int *)data)[0];
            else {
                OSL_ASSERT (0 && "unsupported type for add_constant");
            }
        } else if (t.basetype == TypeDesc::STRING &&
                   datatype.basetype == TypeDesc::STRING && n == datan) {
            // string from string of matching size: copy the raw bytes
            newdata = inst()->shadingsys().alloc_string_constants (n);
            memcpy (newdata, data, t.size());
        } else {
            OSL_ASSERT (0 && "unsupported type for add_constant");
        }
        newconst.data (newdata);
        ind = add_symbol (newconst);
        m_all_consts.push_back (ind);   // remember for future reuse
    }
    return ind;
}
281 
282 
283 
284 int
add_temp(const TypeSpec & type)285 RuntimeOptimizer::add_temp (const TypeSpec &type)
286 {
287     return add_symbol (Symbol (ustring::sprintf ("$opttemp%d", m_next_newtemp++),
288                                type, SymTypeTemp));
289 }
290 
291 
292 
293 int
add_global(ustring name,const TypeSpec & type)294 RuntimeOptimizer::add_global (ustring name, const TypeSpec &type)
295 {
296     int index = inst()->findsymbol (name);
297     if (index < 0)
298         index = add_symbol (Symbol (name, type, SymTypeGlobal));
299     return index;
300 }
301 
302 
303 
304 int
add_symbol(const Symbol & sym)305 RuntimeOptimizer::add_symbol (const Symbol &sym)
306 {
307     size_t index = inst()->symbols().size ();
308     OSL_ASSERT (inst()->symbols().capacity() > index &&
309                 "we shouldn't have to realloc here");
310     inst()->symbols().push_back (sym);
311     // Mark the symbol as always read.  Next time we recompute symbol
312     // lifetimes, it'll get the correct range for when it's read and
313     // written.  But for now, just make sure it doesn't accidentally
314     // look entirely unused.
315     inst()->symbols().back().mark_always_used ();
316     return (int) index;
317 }
318 
319 
320 
321 void
debug_opt_impl(string_view message) const322 RuntimeOptimizer::debug_opt_impl (string_view message) const
323 {
324     static OIIO::spin_mutex mutex;
325     OIIO::spin_lock lock (mutex);
326     std::cout << message;
327 }
328 
329 
330 
331 void
debug_opt_ops(int opbegin,int opend,string_view message) const332 RuntimeOptimizer::debug_opt_ops (int opbegin, int opend, string_view message) const
333 {
334     const Opcode &op (inst()->ops()[opbegin]);
335     std::string oprange;
336     if (opbegin >= 0 && opend-opbegin > 1)
337         oprange = Strutil::sprintf ("ops %d-%d ", opbegin, opend);
338     else if (opbegin >= 0)
339         oprange = Strutil::sprintf ("op %d ", opbegin);
340     debug_optf("  %s%s (@ %s:%d)\n", oprange, message,
341                op.sourcefile(), op.sourceline());
342 }
343 
344 
345 
346 void
debug_turn_into(const Opcode & op,int numops,string_view newop,int newarg0,int newarg1,int newarg2,string_view why)347 RuntimeOptimizer::debug_turn_into (const Opcode &op, int numops,
348                                    string_view newop,
349                                    int newarg0, int newarg1, int newarg2,
350                                    string_view why)
351 {
352     int opnum = &op - &(inst()->ops()[0]);
353     std::string msg;
354     if (numops == 1)
355         msg = Strutil::sprintf ("turned '%s' to '%s", op_string(op), newop);
356     else
357         msg = Strutil::sprintf ("turned to '%s", newop);
358     if (newarg0 >= 0)
359         msg += Strutil::sprintf (" %s", inst()->symbol(newarg0)->name());
360     if (newarg1 >= 0)
361         msg += Strutil::sprintf (" %s", inst()->symbol(newarg1)->name());
362     if (newarg2 >= 0)
363         msg += Strutil::sprintf (" %s", inst()->symbol(newarg2)->name());
364     msg += "'";
365     if (why.size())
366         msg += Strutil::sprintf (" : %s", why);
367     debug_opt_ops (opnum, opnum+numops, msg);
368 }
369 
370 
371 
372 void
turn_into_new_op(Opcode & op,ustring newop,int newarg0,int newarg1,int newarg2,string_view why)373 RuntimeOptimizer::turn_into_new_op (Opcode &op, ustring newop, int newarg0,
374                                     int newarg1, int newarg2, string_view why)
375 {
376     int opnum = &op - &(inst()->ops()[0]);
377     OSL_DASSERT(opnum >= 0 && opnum < (int)inst()->ops().size());
378     if (debug() > 1)
379         debug_turn_into (op, 1, newop, newarg0, newarg1, newarg2, why);
380     op.reset (newop, newarg2<0 ? 2 : 3);
381     inst()->args()[op.firstarg()+0] = newarg0;
382     op.argwriteonly (0);
383     opargsym(op, 0)->mark_rw (opnum, false, true);
384     inst()->args()[op.firstarg()+1] = newarg1;
385     op.argreadonly (1);
386     opargsym(op, 1)->mark_rw (opnum, true, false);
387     if (newarg2 >= 0) {
388         inst()->args()[op.firstarg()+2] = newarg2;
389         op.argreadonly (2);
390         opargsym(op, 2)->mark_rw (opnum, true, false);
391     }
392 }
393 
394 
395 
396 void
turn_into_assign(Opcode & op,int newarg,string_view why)397 RuntimeOptimizer::turn_into_assign (Opcode &op, int newarg, string_view why)
398 {
399     // We don't know the op num here, so we subtract the pointers
400     int opnum = &op - &(inst()->ops()[0]);
401     if (debug() > 1)
402         debug_turn_into (op, 1, "assign", oparg(op,0), newarg, -1, why);
403     op.reset (u_assign, 2);
404     inst()->args()[op.firstarg()+1] = newarg;
405     op.argwriteonly (0);
406     op.argread (1, true);
407     op.argwrite (1, false);
408     // Need to make sure the symbol we're assigning is marked as read
409     // for this op.
410     OSL_DASSERT(opnum >= 0 && opnum < (int)inst()->ops().size());
411     Symbol *arg = opargsym (op, 1);
412     arg->mark_rw (opnum, true, false);
413 }
414 
415 
416 
417 // Turn the current op into a simple assignment to zero (of the first arg).
418 void
turn_into_assign_zero(Opcode & op,string_view why)419 RuntimeOptimizer::turn_into_assign_zero (Opcode &op, string_view why)
420 {
421     static float zero[16] = { 0, 0, 0, 0,  0, 0, 0, 0,
422                               0, 0, 0, 0,  0, 0, 0, 0 };
423     Symbol &R (*(inst()->argsymbol(op.firstarg()+0)));
424     int cind = add_constant (R.typespec(), &zero);
425     turn_into_assign (op, cind, why);
426 }
427 
428 
429 
430 // Turn the current op into a simple assignment to one (of the first arg).
431 void
turn_into_assign_one(Opcode & op,string_view why)432 RuntimeOptimizer::turn_into_assign_one (Opcode &op, string_view why)
433 {
434     Symbol &R (*(inst()->argsymbol(op.firstarg()+0)));
435     if (R.typespec().is_int()) {
436         int one = 1;
437         int cind = add_constant (R.typespec(), &one);
438         turn_into_assign (op, cind, why);
439     } else {
440         OSL_DASSERT (R.typespec().is_triple() || R.typespec().is_float());
441         static float one[3] = { 1, 1, 1 };
442         int cind = add_constant (R.typespec(), &one);
443         turn_into_assign (op, cind, why);
444     }
445 }
446 
447 
448 
449 // Turn the op into a no-op
450 int
turn_into_nop(Opcode & op,string_view why)451 RuntimeOptimizer::turn_into_nop (Opcode &op, string_view why)
452 {
453     if (op.opname() != u_nop) {
454         if (debug() > 1)
455             debug_turn_into (op, 1, "nop", -1, -1, -1, why);
456         op.reset (u_nop, 0);
457         return 1;
458     }
459     return 0;
460 }
461 
462 
463 
464 int
turn_into_nop(int begin,int end,string_view why)465 RuntimeOptimizer::turn_into_nop (int begin, int end, string_view why)
466 {
467     int changed = 0;
468     for (int i = begin;  i < end;  ++i) {
469         Opcode &op (inst()->ops()[i]);
470         if (op.opname() != u_nop) {
471             op.reset (u_nop, 0);
472             ++changed;
473         }
474     }
475     if (debug() > 1 && changed)
476         debug_turn_into (inst()->ops()[begin], end-begin, "nop", -1, -1, -1, why);
477     return changed;
478 }
479 
480 // Turn the op into a no-op functioncall
481 // We keep want to keep the jumps indices so we can correctly
482 // model an inlined function call for the debugger
483 int
turn_into_functioncall_nr(Opcode & op,string_view why)484 RuntimeOptimizer::turn_into_functioncall_nr (Opcode &op, string_view why)
485 {
486     if (op.opname() == u_functioncall) {
487         if (debug() > 1)
488             debug_turn_into (op, 1, "functioncall_nr", -1, -1, -1, why);
489         op.transmute_opname (u_functioncall_nr);
490         return 1;
491     }
492     return 0;
493 }
494 
495 
// Insert a new op at position 'opnum' with the given name and argument
// symbol indices, fixing up everything that indexes into the code array:
// maincode begin/end, jump targets, param init ranges, (optionally)
// symbol read/write ranges, basic block IDs, conditional/loop maps, and
// the first-return marker.  'relation' chooses whether the new op borrows
// its method/source info from the previous (-1) or next (+1) op.
void
RuntimeOptimizer::insert_code (int opnum, ustring opname,
                               const cspan<int> args_to_add,
                               RecomputeRWRangesOption recompute_rw_ranges,
                               InsertRelation relation)
{
    OpcodeVec &code (inst()->ops());
    std::vector<int> &opargs (inst()->args());
    // Inherit the method of the op currently at this position, or 'main'
    // if we're appending past the end.
    ustring method = (opnum < (int)code.size()) ? code[opnum].method() : OSLCompilerImpl::main_method_name();
    int nargs = args_to_add.size();
    Opcode op (opname, method, opargs.size(), nargs);
    code.insert (code.begin()+opnum, op);
    // New args are always appended at the end of the args array; the op
    // records its firstarg offset.
    opargs.insert (opargs.end(), args_to_add.begin(), args_to_add.end());
    if (opnum < inst()->m_maincodebegin)
        ++inst()->m_maincodebegin;
    ++inst()->m_maincodeend;
    // Copy method and source position from the neighboring op indicated
    // by 'relation', when such a neighbor exists.
    if ((relation == -1 && opnum > 0) ||
        (relation == 1 && opnum < (int)code.size()-1)) {
        code[opnum].method (code[opnum+relation].method());
        code[opnum].source (code[opnum+relation].sourcefile(),
                            code[opnum+relation].sourceline());
    }

    // Unless we were inserting at the end, we may need to adjust
    // the jump addresses of other ops and the param init ranges.
    if (opnum < (int)code.size()-1) {
        // Adjust jump offsets: any target past the insertion point
        // shifts down by one.
        for (auto& c : code) {
            for (int j = 0; j < (int)Opcode::max_jumps && c.jump(j) >= 0; ++j) {
                if (c.jump(j) > opnum) {
                    c.jump(j) = c.jump(j) + 1;
                }
            }
        }
        // Adjust param init ranges the same way.
        FOREACH_PARAM (auto&& s, inst()) {
            if (s.initbegin() > opnum)
                s.initbegin (s.initbegin()+1);
            if (s.initend() > opnum)
                s.initend (s.initend()+1);
        }
    }

    // Inserting the instruction may change the read/write ranges of
    // symbols.  Not adjusting this can throw off other optimizations.
    if (recompute_rw_ranges) {
        for (auto&& s : inst()->symbols()) {
            if (s.everread()) {
                int first = s.firstread(), last = s.lastread();
                if (first >= opnum)
                    ++first;
                if (last >= opnum)
                    ++last;
                s.set_read (first, last);
            }
            if (s.everwritten()) {
                int first = s.firstwrite(), last = s.lastwrite();
                if (first >= opnum)
                    ++first;
                if (last >= opnum)
                    ++last;
                s.set_write (first, last);
            }
        }
    }

    // Adjust the basic block IDs and which instructions are inside
    // conditionals.  (The new op inherits the classification of the op
    // that previously occupied its slot.)
    if (m_bblockids.size()) {
        OSL_DASSERT (m_bblockids.size() == code.size()-1);
        m_bblockids.insert (m_bblockids.begin()+opnum, 1, m_bblockids[opnum]);
    }
    if (m_in_conditional.size()) {
        OSL_DASSERT (m_in_conditional.size() == code.size()-1);
        m_in_conditional.insert (m_in_conditional.begin()+opnum, 1,
                                 m_in_conditional[opnum]);
    }
    if (m_in_loop.size()) {
        OSL_DASSERT (m_in_loop.size() == code.size()-1);
        m_in_loop.insert (m_in_loop.begin()+opnum, 1,
                          m_in_loop[opnum]);
    }
    // If the first return happened after this, bump it up
    if (m_first_return >= opnum)
        ++m_first_return;

    if (opname == u_if) {
        // special case for 'if' -- the arg is read, not written
        inst()->symbol(args_to_add[0])->mark_rw (opnum, true, false);
    }
    else if (opname != u_useparam) {
        // Mark the args as being used for this op (assume that the
        // first is written, the others are read).
        for (int a = 0;  a < nargs;  ++a)
            inst()->symbol(args_to_add[a])->mark_rw (opnum, a>0, a==0);
    }
}
594 
595 void
insert_code(int opnum,ustring opname,InsertRelation relation,int arg0,int arg1,int arg2,int arg3)596 RuntimeOptimizer::insert_code (int opnum, ustring opname,
597                                InsertRelation relation,
598                                int arg0, int arg1, int arg2, int arg3)
599 {
600     int args[4];
601     int nargs = 0;
602     if (arg0 >= 0) args[nargs++] = arg0;
603     if (arg1 >= 0) args[nargs++] = arg1;
604     if (arg2 >= 0) args[nargs++] = arg2;
605     if (arg3 >= 0) args[nargs++] = arg3;
606     insert_code (opnum, opname, cspan<int>(args, args + nargs), RecomputeRWRanges, relation);
607 }
608 
609 
610 
611 /// Insert a 'useparam' instruction in front of instruction 'opnum', to
612 /// reference the symbols in 'params'.
613 void
insert_useparam(size_t opnum,const std::vector<int> & params_to_use)614 RuntimeOptimizer::insert_useparam (size_t opnum,
615                                    const std::vector<int> &params_to_use)
616 {
617     OSL_DASSERT (params_to_use.size() > 0);
618     OpcodeVec &code (inst()->ops());
619     insert_code (opnum, u_useparam, params_to_use,
620                  RecomputeRWRanges, GroupWithNext);
621 
622     // All ops are "read"
623     code[opnum].argwrite (0, false);
624     code[opnum].argread (0, true);
625     if (opnum < code.size()-1) {
626         // We have no parse node, but we set the new instruction's
627         // "source" to the one of the statement right after.
628         code[opnum].source (code[opnum+1].sourcefile(),
629                             code[opnum+1].sourceline());
630         // Set the method id to the same as the statement right after
631         code[opnum].method (code[opnum+1].method());
632     } else {
633         // If there IS no "next" instruction, just call it main
634         code[opnum].method (OSLCompilerImpl::main_method_name());
635     }
636 }
637 
638 
639 
// Insert 'useparam' instructions ahead of every op that references shader
// parameters, so the runtime knows when each parameter's value must be
// established (init ops run, connections pulled) before use.  Ends by
// recomputing basic blocks and variable lifetimes, since op numbers shift.
void
RuntimeOptimizer::add_useparam (SymbolPtrVec &allsyms)
{
    OpcodeVec &code (inst()->ops());
    std::vector<int> &opargs (inst()->args());

    // Mark all symbols as un-initialized
    for (auto&& s : inst()->symbols())
        s.initialized (false);

    if (inst()->m_maincodebegin < 0)
        inst()->m_maincodebegin = (int)code.size();

    // Take care of the output params right off the bat -- as soon as the
    // shader starts running 'main'.
    std::vector<int> outputparams;
    for (int i = 0;  i < (int)inst()->symbols().size();  ++i) {
        Symbol *s = inst()->symbol(i);
        if (s->symtype() == SymTypeOutputParam &&
            (s->connected() || s->connected_down() || s->renderer_output() ||
             (s->valuesource() == Symbol::DefaultVal && s->has_init_ops()))) {
            outputparams.push_back (i);
            s->initialized (true);
        }
    }
    if (outputparams.size())
        insert_useparam (inst()->m_maincodebegin, outputparams);

    // Figure out which statements are inside conditional states
    find_conditionals ();

    // Loop over all ops...
    for (int opnum = 0;  opnum < (int)code.size();  ++opnum) {
        Opcode &op (code[opnum]);  // handy ref to the op
        if (op.opname() == u_useparam)
            continue;  // skip useparam ops themselves, if we hit one
        bool simple_assign = is_simple_assign(op);
        bool in_main_code = (opnum >= inst()->m_maincodebegin);
        std::vector<int> params;   // list of params referenced by this op
        // For each argument...
        for (int a = 0;  a < op.nargs();  ++a) {
            int argind = op.firstarg() + a;
            SymbolPtr s = inst()->argsymbol (argind);
            OSL_DASSERT(s->dealias() == s);
            // If this arg is a param and is read, remember it
            if (s->symtype() != SymTypeParam && s->symtype() != SymTypeOutputParam)
                continue;  // skip non-params
            // skip if we've already 'usedparam'ed it unconditionally
            if (s->initialized() && in_main_code)
                continue;

            // Writes within a param's own init-op range don't count as a
            // "use" of that param.
            bool inside_init = (opnum >= s->initbegin() && opnum < s->initend());
            if (op.argread(a) || (op.argwrite(a) && !inside_init)) {
                // Don't add it more than once
                if (std::find (params.begin(), params.end(), opargs[argind]) == params.end()) {
                    // If this arg is the one being written to by a
                    // "simple" assignment, it doesn't need a useparam here.
                    if (! (simple_assign && a == 0))
                        params.push_back (opargs[argind]);
                    // mark as already initialized unconditionally, if we do
                    if (op_is_unconditionally_executed(opnum) &&
                            op.method() == OSLCompilerImpl::main_method_name())
                        s->initialized (true);
                }
            }
        }

        // If the arg we are examining read any params, insert a "useparam"
        // op whose arguments are the list of params we are about to use.
        if (params.size()) {
            insert_useparam (opnum, params);
            // Skip the op we just added
            ++opnum;
        }
    }

    // Mark all symbols as un-initialized
    for (auto&& s : inst()->symbols())
        s.initialized (false);

    // Re-track variable lifetimes, since the inserted useparam
    // instructions will have changed the instruction numbers.
    find_basic_blocks ();
    track_variable_lifetimes (allsyms);
}
725 
726 
727 
728 bool
is_zero(const Symbol & A)729 OSOProcessorBase::is_zero (const Symbol &A)
730 {
731     if (! A.is_constant())
732         return false;
733     const TypeSpec &Atype (A.typespec());
734     static Vec3 Vzero (0, 0, 0);
735     return (Atype.is_float() && *(const float *)A.data() == 0) ||
736         (Atype.is_int() && *(const int *)A.data() == 0) ||
737         (Atype.is_triple() && *(const Vec3 *)A.data() == Vzero);
738 }
739 
740 
741 
742 bool
is_one(const Symbol & A)743 OSOProcessorBase::is_one (const Symbol &A)
744 {
745     if (! A.is_constant())
746         return false;
747     const TypeSpec &Atype (A.typespec());
748     static Vec3 Vone (1, 1, 1);
749     static Matrix44 Mone (1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1);
750     return (Atype.is_float() && *(const float *)A.data() == 1) ||
751         (Atype.is_int() && *(const int *)A.data() == 1) ||
752         (Atype.is_triple() && *(const Vec3 *)A.data() == Vone) ||
753         (Atype.is_matrix() && *(const Matrix44 *)A.data() == Mone);
754 }
755 
756 
757 
758 bool
is_nonzero(const Symbol & A)759 OSOProcessorBase::is_nonzero (const Symbol &A)
760 {
761     if (! A.is_constant())
762         return false;
763     const TypeSpec &Atype (A.typespec());
764     int ncomponents = Atype.numelements() * Atype.aggregate();
765     if (Atype.is_float_based()) {
766         const float *val = (const float *)A.data();
767         for (int i = 0; i < ncomponents; ++i)
768             if (val[i] == 0.0f)
769                 return false;
770         return true;
771     }
772     if (Atype.is_int_based()) {
773         const int *val = (const int *)A.data();
774         for (int i = 0; i < ncomponents; ++i)
775             if (val[i] == 0)
776                 return false;
777         return true;
778     }
779     return false;
780 }
781 
782 
783 
784 std::string
const_value_as_string(const Symbol & A)785 OSOProcessorBase::const_value_as_string (const Symbol &A)
786 {
787     if (! A.is_constant())
788         return std::string();
789     TypeDesc type (A.typespec().simpletype());
790     int n = type.numelements() * type.aggregate;
791     std::ostringstream s;
792     s.imbue (std::locale::classic());  // force C locale
793     if (type.basetype == TypeDesc::FLOAT) {
794         for (int i = 0; i < n; ++i)
795             s << (i ? "," : "") << ((const float *)A.data())[i];
796     } else if (type.basetype == TypeDesc::INT) {
797         for (int i = 0; i < n; ++i)
798             s << (i ? "," : "") << ((const int *)A.data())[i];
799     } else if (type.basetype == TypeDesc::STRING) {
800         for (int i = 0; i < n; ++i)
801             s << (i ? "," : "") << '\"' << ((const ustring *)A.data())[i] << '\"';
802     }
803     return s.str();
804 }
805 
806 
807 
808 void
register_message(ustring name)809 RuntimeOptimizer::register_message (ustring name)
810 {
811     m_local_messages_sent.push_back (name);
812 }
813 
814 
815 
816 void
register_unknown_message()817 RuntimeOptimizer::register_unknown_message ()
818 {
819     m_local_unknown_message_sent = true;
820 }
821 
822 
823 
824 bool
message_possibly_set(ustring name) const825 RuntimeOptimizer::message_possibly_set (ustring name) const
826 {
827     return m_local_unknown_message_sent || m_unknown_message_sent ||
828         std::find (m_messages_sent.begin(), m_messages_sent.end(), name) != m_messages_sent.end() ||
829         std::find (m_local_messages_sent.begin(), m_local_messages_sent.end(), name) != m_local_messages_sent.end();
830 }
831 
832 
833 
834 /// For all the instance's parameters (that can't be overridden by the
835 /// geometry), if they can be found to be effectively constants or
836 /// globals, make constants for them and alias them to the constant. If
837 /// they are connected to an earlier layer's output, if it can determine
838 /// that the output will be a constant or global, then sever the
839 /// connection and just alias our parameter to that value.
void
RuntimeOptimizer::simplify_params ()
{
    // Walk every parameter of the current instance and, where its value
    // can be proven fixed (instance value, plain default, alias of a
    // global, or a fully-constant upstream connection), alias it to a
    // constant or global symbol so later passes can fold uses away.
    for (int i = inst()->firstparam();  i < inst()->lastparam();  ++i) {
        Symbol *s (inst()->symbol(i));
        if (s->symtype() != SymTypeParam)
            continue;  // Skip non-params
        if (! s->lockgeom())
            continue;  // Don't mess with params that can change with the geom
        if (s->typespec().is_structure() || s->typespec().is_closure_based())
            continue;  // We don't mess with struct placeholders or closures

        if (s->valuesource() == Symbol::InstanceVal) {
            // Instance value -- turn it into a constant and remove init ops
            make_symbol_room (1);
            s = inst()->symbol(i);  // In case make_symbol_room changed ptrs
            int cind = add_constant (s->typespec(), s->data());
            global_alias (i, cind); // Alias this symbol to the new const
            turn_into_nop (s->initbegin(), s->initend(),
                           "instance value doesn't need init ops");
        } else if (s->valuesource() == Symbol::DefaultVal && !s->has_init_ops()) {
            // Plain default value without init ops -- turn it into a constant
            make_symbol_room (1);
            s = inst()->symbol(i);  // In case make_symbol_room changed ptrs
            int cind = add_constant (s->typespec(), s->data(), s->typespec().simpletype());
            global_alias (i, cind); // Alias this symbol to the new const
        } else if (s->valuesource() == Symbol::DefaultVal && s->has_init_ops()) {
            // Default val comes from init ops -- special cases?  Yes,
            // if it's a simple assignment from a global whose value is
            // not reassigned later, we can just alias it, and if we're
            // lucky that may eliminate all uses of the parameter.

            // First, trim init ops in case nops have accumulated
            while (s->has_init_ops() && op(s->initbegin()).opname() == u_nop)
                s->initbegin (s->initbegin()+1);
            while (s->has_init_ops() && op(s->initend()-1).opname() == u_nop)
                s->initend (s->initend()-1);
            if (s->initbegin() == s->initend()-1) {  // just one op
                Opcode &op (inst()->ops()[s->initbegin()]);
                if (op.opname() == u_assign) {
                    // The default value has init ops, but they consist of
                    // just a single assignment op...
                    Symbol *src = inst()->argsymbol(op.firstarg()+1);
                    // Is it assigning a global, or a parameter that's
                    // got a default or instance value and isn't on the geom,
                    // and its value is never changed and the types match?
                    if ((src->symtype() == SymTypeGlobal ||
                         src->symtype() == SymTypeConst ||
                         (src->symtype() == SymTypeParam && src->lockgeom() &&
                          (src->valuesource() == Symbol::DefaultVal ||
                           src->valuesource() == Symbol::InstanceVal)))
                        && !src->everwritten()
                        && equivalent(src->typespec(), s->typespec())) {
                        // Great, so let's remember the alias.  We can't
                        // call global_alias() here, because we're still in
                        // init ops, that'll screw us up.  So we just record
                        // it in m_param_aliases and then we'll establish
                        // the global aliases when we hit the main code.
                        m_param_aliases[i] = inst()->arg(op.firstarg()+1);
                    }
                }
            }
        } else if (s->valuesource() == Symbol::ConnectedVal) {
            // It's connected to an earlier layer.  If the output var of
            // the upstream shader is effectively constant or a global,
            // then so is this variable.
            for (auto&& c : inst()->connections()) {
                if (c.dst.param != i)
                    continue;  // not a connection into this parameter
                if (c.dst.is_complete()) {
                    /// All components are being set through either
                    /// float->triple or triple->triple
                    /// Get rid of the un-needed init ops.
                    turn_into_nop (s->initbegin(), s->initend(),
                                   "connected value doesn't need init ops");
                }
                if (c.is_complete()) {
                    // srcsym is the earlier group's output param, which
                    // is fully connected as the input to the param we're
                    // examining.
                    ShaderInstance *uplayer = group()[c.srclayer];
                    Symbol *srcsym = uplayer->symbol(c.src.param);
                    if (!srcsym->lockgeom())
                        continue; // Not if it can be overridden by geometry

                    // Is the source symbol known to be a global, from
                    // earlier analysis by find_params_holding_globals?
                    // If so, make sure the global is in this instance's
                    // symbol table, and alias the parameter to it.
                    ustringmap_t &g (m_params_holding_globals[c.srclayer]);
                    auto f = g.find (srcsym->name());
                    if (f != g.end()) {
                        if (debug() > 1)
                            debug_optf("Remapping %s.%s because it's connected to "
                                       "%s.%s, which is known to be %s\n",
                                       inst()->layername(), s->name(),
                                       uplayer->layername(), srcsym->name(),
                                       f->second);
                        make_symbol_room (1);
                        s = inst()->symbol(i);  // In case make_symbol_room changed ptrs
                        int ind = add_global (f->second, srcsym->typespec());
                        global_alias (i, ind);
                        shadingsys().m_stat_global_connections += 1;
                        break;
                    }

                    // Upstream output is a never-written default/instance
                    // value with no init ops -- the connection propagates
                    // a compile-time constant; bake it in and sever.
                    if (!srcsym->everwritten() &&
                        (srcsym->valuesource() == Symbol::DefaultVal ||
                         srcsym->valuesource() == Symbol::InstanceVal) &&
                        !srcsym->has_init_ops()) {
                        make_symbol_room (1);
                        s = inst()->symbol(i);  // In case make_symbol_room changed ptrs
                        int cind = add_constant (s->typespec(), srcsym->data(),
                                                 srcsym->typespec().simpletype());
                        // Alias this symbol to the new const
                        global_alias (i, cind);
                        make_param_use_instanceval (s, "- upstream layer sets it to a constant");
                        replace_param_value (s, srcsym->data(), srcsym->typespec());
                        shadingsys().m_stat_const_connections += 1;
                        break;
                    }
                }
            }
            // FIXME / N.B.: We only optimize "fully complete" connections,
            // not those involving individual components or array elements
            // of the connected parameters, because we sure don't track the
            // constness or aliasing of individual components/element, only
            // whole variables. But there are two cases where the logic
            // above fails to fully exploit the connection propagating a
            // constant value. (a) Partial-to-whole connections, for example
            // connecting one component of an upstream triple output to a
            // downstream float input, should propagate the constant, but we
            // currently neglect this case. (b) If *multiple* connections
            // combine to fully propagate values, for example if someone was
            // foolish enough to connect R, G, and B components of color
            // parameters *separately*, we sure don't notice that and treat
            // it as a full connection of the color.
        }
    }
}
980 
981 
982 
983 /// For all the instance's parameters, if they are simply assigned globals,
984 /// record that in m_params_holding_globals.
void
RuntimeOptimizer::find_params_holding_globals ()
{
    // Identify output params of this layer that are unconditionally
    // assigned a global exactly once, and record name->global in
    // m_params_holding_globals[layer()] for use by simplify_params()
    // when optimizing downstream layers.
    FOREACH_PARAM (auto&& s, inst()) {
        // Skip if this isn't a shader output parameter that's connected
        // to a later layer.
        if (s.symtype() != SymTypeParam && s.symtype() != SymTypeOutputParam)
            continue;  // Skip non-params
        if (!s.connected_down())
            continue;  // Skip unconnected params -- who cares
        if (s.valuesource() != Symbol::DefaultVal)
            continue;  // Skip -- must be connected or an instance value
        if (s.firstwrite() < 0 || s.firstwrite() != s.lastwrite())
            continue;  // Skip -- written more than once

        int opnum = s.firstwrite();
        Opcode &op (inst()->ops()[opnum]);
        if (op.opname() != u_assign || ! op_is_unconditionally_executed(opnum))
            continue;   // Not a simple assignment unconditionally performed

        // what s is assigned from (fully dealiased)
        Symbol *src = inst()->symbol (dealias_symbol (oparg (op, 1), opnum));

        if (src->symtype() != SymTypeGlobal)
            continue;   // only interested in global assignments

        if (debug() > 1)
            debug_optf("I think that %s.%s will always be %s\n",
                       inst()->layername(), s.name(), src->name());
        m_params_holding_globals[layer()][s.name()] = src->name();
    }
}
1017 
1018 
1019 
1020 void
find_conditionals()1021 OSOProcessorBase::find_conditionals ()
1022 {
1023     OpcodeVec &code (inst()->ops());
1024 
1025     m_in_conditional.clear ();
1026     m_in_conditional.resize (code.size(), false);
1027     m_in_loop.clear ();
1028     m_in_loop.resize (code.size(), false);
1029     m_first_return = (int)code.size();
1030     for (int i = 0;  i < (int)code.size();  ++i) {
1031         if (code[i].jump(0) >= 0) {
1032             std::fill (m_in_conditional.begin()+i,
1033                        m_in_conditional.begin()+code[i].farthest_jump(), true);
1034             if (code[i].opname() == Strings::op_dowhile ||
1035                   code[i].opname() == Strings::op_for ||
1036                   code[i].opname() == Strings::op_while) {
1037                 std::fill (m_in_loop.begin()+i,
1038                            m_in_loop.begin()+code[i].farthest_jump(), true);
1039             }
1040         }
1041         if (code[i].opname() == Strings::op_exit)
1042             m_first_return = std::min (m_first_return, i);
1043     }
1044 }
1045 
1046 
1047 
1048 void
find_basic_blocks()1049 OSOProcessorBase::find_basic_blocks ()
1050 {
1051     OpcodeVec &code (inst()->ops());
1052 
1053     // Start by setting all basic block IDs to 0
1054     m_bblockids.clear ();
1055     m_bblockids.resize (code.size(), 0);
1056 
1057     // First, keep track of all the spots where blocks begin
1058     std::vector<bool> block_begin (code.size(), false);
1059 
1060     // Init ops start basic blocks
1061     FOREACH_PARAM (const Symbol &s, inst()) {
1062         if (s.has_init_ops())
1063             block_begin[s.initbegin()] = true;
1064     }
1065 
1066     // Main code starts a basic block
1067     block_begin[inst()->maincodebegin()] = true;
1068 
1069     for (size_t opnum = 0;  opnum < code.size();  ++opnum) {
1070         Opcode &op (code[opnum]);
1071         if (op.opname() == u_functioncall_nr)
1072         {   // Treat the 'no return' function call as if it were a nop.
1073             // we use later to generate correct inline debug information.
1074             continue;
1075         }
1076         // Anyplace that's the target of a jump instruction starts a basic block
1077         for (int j = 0;  j < (int)Opcode::max_jumps;  ++j) {
1078             if (op.jump(j) >= 0)
1079                 block_begin[op.jump(j)] = true;
1080             else
1081                 break;
1082         }
1083         // The first instruction in a conditional or loop (which is not
1084         // itself a jump target) also begins a basic block.  If the op has
1085         // any jump targets at all, it must be a conditional or loop.
1086         if (op.jump(0) >= 0)
1087             block_begin[opnum+1] = true;
1088         // 'break', 'continue', 'return', and 'exit' also cause the next
1089         // statement to begin a new basic block.
1090         if (op.opname() == u_break || op.opname() == u_continue ||
1091             op.opname() == u_return || op.opname() == u_exit)
1092             block_begin[opnum+1] = true;
1093     }
1094 
1095     // Now color the blocks with unique identifiers
1096     int bbid = 1;  // next basic block ID to use
1097     for (size_t opnum = 0;  opnum < code.size();  ++opnum) {
1098         if (block_begin[opnum])
1099             ++bbid;
1100         m_bblockids[opnum] = bbid;
1101     }
1102 }
1103 
1104 
1105 
1106 /// For 'R = A_const' where R and A are different, but coerceable,
1107 /// types, turn it into a constant assignment of the exact type.
1108 /// Return true if a change was made, otherwise return false.
1109 bool
coerce_assigned_constant(Opcode & op)1110 RuntimeOptimizer::coerce_assigned_constant (Opcode &op)
1111 {
1112     OSL_DASSERT (op.opname() == u_assign);
1113     Symbol *R (inst()->argsymbol(op.firstarg()+0));
1114     Symbol *A (inst()->argsymbol(op.firstarg()+1));
1115 
1116     if (! A->is_constant() || R->typespec().is_closure_based())
1117         return false;   // we don't handle those cases
1118 
1119     // turn 'R_float = A_int_const' into a float const assignment
1120     if (A->typespec().is_int() && R->typespec().is_float()) {
1121         float result = *(int *)A->data();
1122         int cind = add_constant (R->typespec(), &result);
1123         turn_into_assign (op, cind, "coerce to correct type");
1124         return true;
1125     }
1126 
1127     // turn 'R_int = A_float_const' into an int const assignment
1128     if (A->typespec().is_float() && R->typespec().is_int()) {
1129         int result = (int) *(float *)A->data();
1130         int cind = add_constant (R->typespec(), &result);
1131         turn_into_assign (op, cind, "coerce to correct type");
1132         return true;
1133     }
1134 
1135     // turn 'R_triple = A_int_const' into a float const assignment
1136     if (A->typespec().is_int() && R->typespec().is_triple()) {
1137         float f = *(int *)A->data();
1138         Vec3 result (f, f, f);
1139         int cind = add_constant (R->typespec(), &result);
1140         turn_into_assign (op, cind, "coerce to correct type");
1141         return true;
1142     }
1143 
1144     // turn 'R_triple = A_float_const' into a triple const assignment
1145     if (A->typespec().is_float() && R->typespec().is_triple()) {
1146         float f = *(float *)A->data();
1147         Vec3 result (f, f, f);
1148         int cind = add_constant (R->typespec(), &result);
1149         turn_into_assign (op, cind, "coerce to correct type");
1150         return true;
1151     }
1152 
1153     // Turn 'R_triple = A_other_triple_constant' into a triple const assign
1154     if (A->typespec().is_triple() && R->typespec().is_triple() &&
1155         A->typespec() != R->typespec()) {
1156         Vec3 *f = (Vec3 *)A->data();
1157         int cind = add_constant (R->typespec(), f);
1158         turn_into_assign (op, cind, "coerce to correct type");
1159         return true;
1160     }
1161 
1162     // turn 'R_matrix = A_float_const' into a matrix const assignment
1163     if (A->typespec().is_float() && R->typespec().is_matrix()) {
1164         float f = *(float *)A->data();
1165         Matrix44 result (f, 0, 0, 0, 0, f, 0, 0, 0, 0, f, 0, 0, 0, 0, f);
1166         int cind = add_constant (R->typespec(), &result);
1167         turn_into_assign (op, cind, "coerce to correct type");
1168         return true;
1169     }
1170     // turn 'R_matrix = A_int_const' into a matrix const assignment
1171     if (A->typespec().is_int() && R->typespec().is_matrix()) {
1172         float f = *(int *)A->data();
1173         Matrix44 result (f, 0, 0, 0, 0, f, 0, 0, 0, 0, f, 0, 0, 0, 0, f);
1174         int cind = add_constant (R->typespec(), &result);
1175         turn_into_assign (op, cind, "coerce to correct type");
1176         return true;
1177     }
1178 
1179     return false;
1180 }
1181 
1182 
1183 
void
RuntimeOptimizer::clear_stale_syms ()
{
    // Forget all pending "stale assignment" records (e.g. when crossing
    // a boundary past which stale-assignment elision would be unsafe).
    m_stale_syms.clear ();
}
1189 
1190 
1191 
1192 void
use_stale_sym(int sym)1193 RuntimeOptimizer::use_stale_sym (int sym)
1194 {
1195     FastIntMap::iterator i = m_stale_syms.find(sym);
1196     if (i != m_stale_syms.end())
1197         m_stale_syms.erase (i);
1198 }
1199 
1200 
1201 
1202 bool
is_simple_assign(Opcode & op,const OpDescriptor * opd)1203 RuntimeOptimizer::is_simple_assign (Opcode &op, const OpDescriptor *opd)
1204 {
1205     // Simple only if arg0 is the only write, and is write only.
1206     if (op.argwrite_bits() != 1 || op.argread(0))
1207         return false;
1208     if (! opd)
1209         opd = shadingsys().op_descriptor (op.opname());
1210     if (!opd || !opd->simple_assign)
1211         return false;   // reject all other known non-simple assignments
1212     // Make sure the result isn't also read
1213     int result = oparg(op,0);
1214     for (int i = 1, e = op.nargs();  i < e;  ++i)
1215         if (oparg(op,i) == result)
1216             return false;
1217     return true;
1218 }
1219 
1220 
1221 
1222 void
simple_sym_assign(int sym,int opnum)1223 RuntimeOptimizer::simple_sym_assign (int sym, int opnum)
1224 {
1225     if (optimize() >= 2 && m_opt_stale_assign) {
1226         FastIntMap::iterator i = m_stale_syms.find(sym);
1227         if (i != m_stale_syms.end()) {
1228             Opcode &uselessop (inst()->ops()[i->second]);
1229             if (uselessop.opname() != u_nop && uselessop.opname() != u_functioncall_nr)
1230                 turn_into_nop (uselessop,
1231                            debug() > 1 ? Strutil::sprintf("remove stale value assignment to %s, reassigned on op %d",
1232                                                          opargsym(uselessop,0)->name(), opnum).c_str() : "");
1233         }
1234     }
1235     m_stale_syms[sym] = opnum;
1236 }
1237 
1238 
1239 
1240 bool
unread_after(const Symbol * A,int opnum)1241 RuntimeOptimizer::unread_after (const Symbol *A, int opnum)
1242 {
1243     // Try to figure out if this symbol is completely unused after this
1244     // op (and thus, any values written to it now will never be needed).
1245 
1246     // Globals may be read by later layers
1247     if (A->symtype() == SymTypeGlobal)
1248         return false;
1249 
1250     // Params may be read afterwards if connected to a downstream
1251     // layer or if "elide_unconnected_outputs" is turned off.
1252     if (A->symtype() == SymTypeOutputParam || A->symtype() == SymTypeParam) {
1253         if (! m_opt_elide_unconnected_outputs)
1254             return false;   // Asked not do do this optimization
1255         if (A->connected_down())
1256             return false;   // Connected to something downstream
1257         if (A->renderer_output())
1258             return false;   // This is a renderer output -- don't cull it
1259     }
1260 
1261     // For all else, check if it's either never read at all in this
1262     // layer or it's only read earlier and we're not part of a loop
1263     return !A->everread() || (A->lastread() <= opnum && !m_in_loop[opnum]);
1264 }
1265 
1266 
1267 
void
RuntimeOptimizer::replace_param_value (Symbol *R, const void *newdata,
                                       const TypeSpec &newdata_type)
{
    // Overwrite parameter R's stored instance value (in the instance's
    // m_fparams / m_iparams / m_sparams arrays) with 'newdata'.
    // Supported conversions: float<-float, float<-int (with scalar->
    // aggregate replication), int<-int, string<-string; anything else
    // asserts.  UNKNOWN newdata_type means "same type as R".
    OSL_DASSERT (R->symtype() == SymTypeParam || R->symtype() == SymTypeOutputParam);
    TypeDesc Rtype = R->typespec().simpletype();
    OSL_DASSERT(R->dataoffset() >= 0);
    int Rnvals = int(Rtype.aggregate * Rtype.numelements());
    TypeDesc Ntype = newdata_type.simpletype();
    if (Ntype == TypeDesc::UNKNOWN)
        Ntype = Rtype;
    int Nnvals = int(Ntype.aggregate * Ntype.numelements());
    if (Rtype.basetype == TypeDesc::FLOAT &&
          Ntype.basetype == TypeDesc::FLOAT) {
        float *Rdefault = &inst()->m_fparams[R->dataoffset()];
        OSL_DASSERT((R->dataoffset()+Rnvals) <= (int)inst()->m_fparams.size());
        if (Rnvals == Nnvals)   // straight copy
            for (int i = 0;  i < Rnvals;  ++i)
                Rdefault[i] = ((const float *)newdata)[i];
        else if (Nnvals == 1)  // scalar -> aggregate, by replication
            for (int i = 0;  i < Rnvals;  ++i)
                Rdefault[i] = ((const float *)newdata)[0];
        else {
            OSL_ASSERT (0 && "replace_param_value: unexpected types");
        }
    }
    else if (Rtype.basetype == TypeDesc::FLOAT &&
             Ntype.basetype == TypeDesc::INT) {
        // Careful, this is an int-to-float conversion
        float *Rdefault = &inst()->m_fparams[R->dataoffset()];
        OSL_DASSERT((R->dataoffset()+Rnvals) <= (int)inst()->m_fparams.size());
        if (Rnvals == Nnvals)   // straight copy
            for (int i = 0;  i < Rnvals;  ++i)
                Rdefault[i] = ((const int *)newdata)[i];
        else if (Nnvals == 1)  // scalar -> aggregate, by replication
            for (int i = 0;  i < Rnvals;  ++i)
                Rdefault[i] = ((const int *)newdata)[0];
        else {
            OSL_ASSERT (0 && "replace_param_value: unexpected types");
        }
    }
    else if (Rtype.basetype == TypeDesc::INT &&
             Ntype.basetype == TypeDesc::INT && Rnvals == Nnvals) {
        int *Rdefault = &inst()->m_iparams[R->dataoffset()];
        OSL_DASSERT((R->dataoffset()+Rnvals) <= (int)inst()->m_iparams.size());
        for (int i = 0;  i < Rnvals;  ++i)
            Rdefault[i] = ((const int *)newdata)[i];
    }
    else if (Rtype.basetype == TypeDesc::STRING &&
             Ntype.basetype == TypeDesc::STRING && Rnvals == Nnvals) {
        ustring *Rdefault = &inst()->m_sparams[R->dataoffset()];
        OSL_DASSERT((R->dataoffset()+Rnvals) <= (int)inst()->m_sparams.size());
        for (int i = 0;  i < Rnvals;  ++i)
            Rdefault[i] = ((const ustring *)newdata)[i];
    } else {
        OSL_ASSERT (0 && "replace_param_value: unexpected types");
    }
}
1326 
1327 
1328 
1329 // Predicate to test if the connection's destination is never used
1330 struct ConnectionDestIs
1331 {
ConnectionDestIspvt::ConnectionDestIs1332     ConnectionDestIs (const ShaderInstance &inst, const Symbol *sym)
1333         : m_inst(inst), m_sym(sym) { }
operator ()pvt::ConnectionDestIs1334     bool operator() (const Connection &c) {
1335         return m_inst.symbol(c.dst.param) == m_sym;
1336     }
1337 private:
1338     const ShaderInstance &m_inst;
1339     const Symbol *m_sym;
1340 };
1341 
1342 
1343 
1344 /// Symbol R in the current instance has a connection or init ops we
1345 /// no longer need; turn it into a a plain old instance-value
1346 /// parameter.
void
RuntimeOptimizer::make_param_use_instanceval (Symbol *R, string_view why)
{
    // Demote parameter R to a plain instance value: drop its connection
    // or init ops, clear its deriv requirement, and point its data at
    // the instance's parameter storage.  'why' is only used for the
    // debug trace message.
    if (debug() > 1)
        std::cout << "Turning " << R->valuesourcename() << ' '
                  << R->typespec() << ' ' << R->name()
                  << " into an instance value "
                  << why << "\n";

    // Mark its source as the instance value, not connected
    R->valuesource (Symbol::InstanceVal);
    // If it isn't a connection or computed, it doesn't need derivs.
    R->has_derivs (false);

    // Point the symbol's data pointer to its instance value
    // uniform
    void *Rdefault = NULL;
    OSL_DASSERT(R->dataoffset() >= 0);
    TypeDesc Rtype = R->typespec().simpletype();
    // Select the per-basetype parameter array holding R's value.
    if (Rtype.basetype == TypeDesc::FLOAT)
        Rdefault = &inst()->m_fparams[R->dataoffset()];
    else if (Rtype.basetype == TypeDesc::INT)
        Rdefault = &inst()->m_iparams[R->dataoffset()];
    else if (Rtype.basetype == TypeDesc::STRING)
        Rdefault = &inst()->m_sparams[R->dataoffset()];
    OSL_DASSERT(Rdefault != NULL);
    R->data (Rdefault);

    // Get rid of any init ops
    if (R->has_init_ops()) {
        turn_into_nop (R->initbegin(), R->initend(), "init ops not needed");
        R->initbegin (0);
        R->initend (0);
    }
    // Erase R's incoming connections
    erase_if (inst()->connections(), ConnectionDestIs(*inst(),R));
}
1384 
1385 
1386 
1387 /// Check for conditions under which assignments to output parameters
1388 /// can be removed.
1389 ///
1390 /// Return true if the assignment is removed entirely.
bool
RuntimeOptimizer::outparam_assign_elision (int opnum, Opcode &op)
{
    // Try to eliminate (or alias away) an assignment to an output
    // parameter.  Returns true only if the op was turned into a nop.
    OSL_DASSERT (op.opname() == u_assign);
    Symbol *R (inst()->argsymbol(op.firstarg()+0));
    Symbol *A (inst()->argsymbol(op.firstarg()+1));

    if (R->symtype() != SymTypeOutputParam)
        return false;    // This logic is only about output params

    // Check for assignment of output params that are written only once
    // in the whole shader -- on this statement -- and assigned a
    // constant, and the assignment is unconditional.  In that case,
    // just alias it to the constant from here on out.
    if (// R is being assigned a constant of the right type:
        A->is_constant() && R->typespec() == A->typespec()
                // FIXME -- can this be equivalent() rather than == ?
        // and it's written only on this op, and unconditionally:
        && R->firstwrite() == opnum && R->lastwrite() == opnum
        && !m_in_conditional[opnum]
        // and this is not a case of an init op for an output param that
        // actually will get an instance value or a connection:
        && ! ((R->valuesource() == Symbol::InstanceVal || R->connected())
              && R->initbegin() <= opnum && R->initend() > opnum)
        ) {
        // Alias it to the constant it's being assigned
        int cind = inst()->args()[op.firstarg()+1];
        global_alias (inst()->args()[op.firstarg()], cind);
        // If it's also never read before this assignment and isn't a
        // designated renderer output (which we obviously must write!), just
        // replace its default value entirely and get rid of the assignment.
        if (R->firstread() > opnum && ! R->renderer_output() &&
                m_opt_elide_unconnected_outputs) {
            make_param_use_instanceval (R, Strutil::sprintf("- written once, with a constant (%s), before any reads", const_value_as_string(*A)));
            replace_param_value (R, A->data(), A->typespec());
            turn_into_nop (op, debug() > 1 ? Strutil::sprintf("oparam %s never subsequently read or connected", R->name()).c_str() : "");
            return true;
        }
    }

    // If the output param will neither be read later in the shader nor
    // connected to a downstream layer, then we don't really need this
    // assignment at all. Note that unread_after() does take into
    // consideration whether it's a renderer output.
    if (unread_after(R,opnum)) {
        turn_into_nop (op, debug() > 1 ? Strutil::sprintf("oparam %s never subsequently read or connected", R->name()).c_str() : "");
        return true;
    }

    return false;
}
1442 
1443 
1444 
1445 
1446 /// If every potentially-written argument to this op is NEVER read, turn
1447 /// it into a nop and return true.  We don't do this to ops that have no
1448 /// written args at all, since they tend to have side effects (e.g.,
1449 /// printf, setmessage).
1450 bool
useless_op_elision(Opcode & op,int opnum)1451 RuntimeOptimizer::useless_op_elision (Opcode &op, int opnum)
1452 {
1453     if (op.nargs()) {
1454         bool writes_something = false;
1455         for (int a = 0;  a < op.nargs();  ++a) {
1456             if (op.argwrite(a)) {
1457                 writes_something = true;
1458                 Symbol *A = opargsym (op, a);
1459                 if (! unread_after(A,opnum))
1460                     return false;
1461             }
1462         }
1463         // If we get this far, nothing written had any effect
1464         if (writes_something) {
1465             // Enumerate exceptions -- ops that write something, but have
1466             // side effects that means they shouldn't be eliminated.
1467             if (op.opname() == u_pointcloud_write)
1468                 return false;
1469             // It's a useless op, eliminate it
1470             turn_into_nop (op, "eliminated op whose writes will never be read");
1471             return true;
1472         }
1473     }
1474     return false;
1475 }
1476 
1477 
1478 
int
RuntimeOptimizer::dealias_symbol (int symindex, int opnum)
{
    // Resolve symindex through the alias tables: block-local aliases
    // first, then permanent aliases, then (only for main code) the
    // parameter aliases recorded by simplify_params().
    //
    // NOTE(review): because the loop condition is `while (0)`, each
    // `continue` jumps to the (false) condition and exits -- aliases
    // are followed only ONE level per call.  Presumably the alias maps
    // are maintained fully collapsed so one step suffices; confirm, or
    // these `continue`s were meant to re-iterate (a `while (1)` would
    // need cycle protection).
    do {
        int i = block_alias (symindex);
        if (i >= 0) {
            // block-specific alias for the sym
            symindex = i;
            continue;
        }
        FastIntMap::const_iterator found;
        found = m_symbol_aliases.find (symindex);
        if (found != m_symbol_aliases.end()) {
            // permanent alias for the sym
            symindex = found->second;
            continue;
        }
        if (inst()->symbol(symindex)->symtype() == SymTypeParam &&
            opnum >= inst()->maincodebegin()) {
            // Only check parameter aliases for main code
            found = m_param_aliases.find (symindex);
            if (found != m_param_aliases.end()) {
                symindex = found->second;
                continue;
            }
        }
    } while (0);
    return symindex;
}
1508 
1509 
1510 
1511 void
block_unalias(int symindex)1512 RuntimeOptimizer::block_unalias (int symindex)
1513 {
1514     FastIntMap::iterator i = m_block_aliases.find (symindex);
1515     if (i != m_block_aliases.end())
1516         i->second = -1;
1517     // In addition to the current block_aliases, unalias from any
1518     // saved alias lists.
1519     for (auto& ba : m_block_aliases_stack) {
1520         FastIntMap::iterator i = ba->find (symindex);
1521         if (i != ba->end())
1522             i->second = -1;
1523     }
1524 }
1525 
1526 
1527 
/// Make sure there's room for at least 'howmany' more symbols, so that
/// we can add consts if we need to, without worrying about the
/// addresses of existing symbols changing if we add a new one soon.
void
RuntimeOptimizer::make_symbol_room (int howmany)
{
    // Forward to the instance; it may grow (reserve) its symbol table.
    // Callers rely on this headroom so that Symbol* pointers they hold
    // are not invalidated by subsequent add_constant() calls.
    inst()->make_symbol_room (howmany);
}
1536 
1537 
1538 
1539 
1540 // Predicate to test if a symbol (specified by symbol index, symbol
1541 // pointer, or by the inbound Connection record) is never used within
1542 // the shader or passed along.  Subtlety: you can't base the test for
1543 // params on sym->everused(), since of course it may be used within its
1544 // own init ops, but then never subsequently used, and thus be a prime
1545 // candidate for culling.  Instead, for params we test whether it was
1546 // used at any point AFTER its init ops.
1547 class SymNeverUsed
1548 {
1549 public:
SymNeverUsed(const RuntimeOptimizer & rop,const ShaderInstance * inst)1550     SymNeverUsed (const RuntimeOptimizer &rop, const ShaderInstance *inst)
1551         : m_rop(rop), m_inst(inst)
1552     { }
operator ()(const Symbol & sym) const1553     bool operator() (const Symbol &sym) const {
1554         if (sym.symtype() == SymTypeParam)
1555             return (sym.lastuse() < sym.initend()) && !sym.connected_down();
1556         if (sym.symtype() == SymTypeOutputParam) {
1557             if (! m_rop.opt_elide_unconnected_outputs())
1558                 return false;   // Asked not to do this optimization
1559             if (sym.connected_down())
1560                 return false;   // Connected to something downstream
1561             if (sym.renderer_output())
1562                 return false;   // This is a renderer output
1563             return (sym.lastuse() < sym.initend());
1564         }
1565         return ! sym.everused();  // all other symbol types
1566     }
operator ()(int symid) const1567     bool operator() (int symid) const {
1568         return (*this)(*m_inst->symbol(symid));
1569     }
operator ()(const Connection & c) const1570     bool operator() (const Connection &c) const {
1571         return (*this)(c.dst.param);
1572     }
1573 private:
1574     const RuntimeOptimizer &m_rop;
1575     const ShaderInstance *m_inst;
1576 };
1577 
1578 
1579 
1580 int
next_block_instruction(int opnum)1581 RuntimeOptimizer::next_block_instruction (int opnum)
1582 {
1583     int end = (int)inst()->ops().size();
1584     for (int n = opnum+1; n < end && m_bblockids[n] == m_bblockids[opnum]; ++n)
1585         if (inst()->ops()[n].opname() != u_nop && inst()->ops()[n].opname() != u_functioncall_nr)
1586             return n;   // Found it!
1587     return 0;   // End of ops or end of basic block
1588 }
1589 
1590 
1591 
/// Apply two-instruction "peephole" optimizations to the pair of ops
/// at opnum and op2num (op2num is the next real instruction in the
/// same basic block, as found by next_block_instruction).  Returns the
/// number of changes made (0 if no transformation applied).
int
RuntimeOptimizer::peephole2 (int opnum, int op2num)
{
    Opcode &op (inst()->ops()[opnum]);
    Opcode &next (inst()->ops()[op2num]);

    // N.B. Some of these transformations may look strange, you may
    // think "nobody will write code that does that", but (a) they do;
    // and (b) it can end up like that after other optimizations have
    // changed the code around.

    // Ping-pong assignments can eliminate the second one:
    //     assign a b
    //     assign b a    <-- turn into nop
    // But note that if a is an int and b is a float, this transformation
    // is not safe because of the intentional truncation.
    if (op.opname() == u_assign && next.opname() == u_assign) {
        Symbol *a = opargsym(op,0);
        Symbol *b = opargsym(op,1);
        Symbol *c = opargsym(next,0);
        Symbol *d = opargsym(next,1);
        if (a == d && b == c) {
            // Exclude the integer truncation case
            if (! (a->typespec().is_int() && b->typespec().is_float_based())) {
                // std::cerr << "ping-pong assignment " << opnum << " of "
                //           << opargsym(op,0)->mangled() << " and "
                //           << opargsym(op,1)->mangled() << "\n";
                turn_into_nop (next, "ping-pong assignments");
                return 1;
            }
        }
    }

    // Daisy chain assignments -> use common source
    //     assign a b
    //     assign c a
    // turns into:
    //     assign a b
    //     assign c b
    // This may allow a to be eliminated if it's not used elsewhere.
    // But note that this doesn't work for float = int = float,
    // which intentionally truncates before the assignment to c!
    if (op.opname() == u_assign && next.opname() == u_assign) {
        Symbol *a = opargsym(op,0);
        Symbol *b = opargsym(op,1);
        Symbol *c = opargsym(next,0);
        Symbol *d = opargsym(next,1);
        if (a == d && assignable (c->typespec(), b->typespec())) {
            // Exclude the float=int=float case
            if (! (a->typespec().is_int() && b->typespec().is_float_based() &&
                   c->typespec().is_float_based() && !c->typespec().is_array())) {
                turn_into_assign (next, inst()->arg(op.firstarg()+1),
                                  "daisy-chain assignments");
                return 1;
            }
        }
    }

    // Look for adjacent add and subtract of the same value:
    //     add a a b
    //     sub a a b
    // (or vice versa)
    if (((op.opname() == u_add && next.opname() == u_sub) ||
         (op.opname() == u_sub && next.opname() == u_add)) &&
          opargsym(op,0) == opargsym(next,0) &&
          opargsym(op,1) == opargsym(next,1) &&
          opargsym(op,2) == opargsym(next,2) &&
          opargsym(op,0) == opargsym(op,1)) {
        // std::cerr << "dueling add/sub " << opnum << " & " << op2num << ": "
        //           << opargsym(op,0)->mangled() << "\n";
        turn_into_nop (op, "simplify add/sub pair");
        turn_into_nop (next, "simplify add/sub pair");
        return 2;
    }

    // Look for add of a value then subtract of the same value
    //     add a b c     or:    sub a b c
    //     sub d a c            add d a c
    // the second instruction should be changed to
    //     assign d b
    // and furthermore, if the only use of a is on these two lines or
    // if a == d, then the first instruction can be changed to a 'nop'.
    // Careful, "only used on these two lines" can be tricky if 'a' is a
    // global or output parameter, which are used after the shader finishes!
    if (((op.opname() == u_add && next.opname() == u_sub) ||
         (op.opname() == u_sub && next.opname() == u_add)) &&
        opargsym(op,0) == opargsym(next,1) &&
        opargsym(op,2) == opargsym(next,2) &&
        opargsym(op,0) != opargsym(next,2) /* a != c */) {
        Symbol *a = opargsym(op,0);
        Symbol *d = opargsym(next,0);
        turn_into_assign (next, oparg(op,1)/*b*/, "simplify add/sub pair");
        if ((a->firstuse() >= opnum && a->lastuse() <= op2num &&
             ((a->symtype() != SymTypeGlobal && a->symtype() != SymTypeOutputParam)))
            || a == d) {
            // 'a' lives only on these two lines (or is the same as d),
            // so the first op is dead too.
            turn_into_nop (op, "simplify add/sub pair");
            return 2;
        }
        else
            return 1;
    }

    // Look for simple functions followed by an assignment:
    //    OP a b...
    //    assign c a
    // If OP is "simple" (completely overwrites its first argument, only
    // reads the rest), and a and c are the same type, and a is never
    // used again, then we can replace those two instructions with:
    //    OP c b...
    // Careful, "never used again" can be tricky if 'a' is a global or
    // output parameter, which are used after the shader finishes!
    if (next.opname() == u_assign &&
        op.nargs() >= 1 && opargsym(op,0) == opargsym(next,1) &&
        is_simple_assign(op)) {
        Symbol *a = opargsym(op,0);
        Symbol *c = opargsym(next,0);
        if (a->firstuse() >= opnum && a->lastuse() <= op2num &&
              (a->symtype() != SymTypeGlobal && a->symtype() != SymTypeOutputParam) &&
              equivalent (a->typespec(), c->typespec())) {
            if (debug() > 1)
                debug_opt_ops (opnum, opnum+1,
                               Strutil::sprintf ("turned '%s %s...' to '%s %s...' as part of daisy-chain",
                                 op.opname(), a->name(), op.opname(), c->name()));
            // Retarget OP's first (output) arg from a to c.
            inst()->args()[op.firstarg()] = inst()->args()[next.firstarg()];
            c->mark_rw (opnum, false, true);
            // Any time we write to a variable that wasn't written to at
            // this op previously, we need to block_unalias it, or it
            // can dealias to the wrong thing when examining subsequent
            // instructions.
            block_unalias (oparg(op,0));  // clear any aliases
            turn_into_nop (next, "daisy-chain op and assignment");
            return 2;
        }
    }

    // Convert this combination
    //     closure A name arg...
    //     mul B A weight
    // into
    //     closure B C name arg...
    // That is, collapse a creation and immediate scale of a closure into
    // a single closure-with-scale constructor. (Valid if A is not used
    // elsewhere.)  Further refinement: if weight = 1, no need to do
    // the scale, and if weight == 0, eliminate the work entirely.
    // We only do this optimization on pass > 1, to give a fair chance
    // for other optimizations to be able to turn the weight into a
    // constant before we do this one (since if it's 1 or 0, we can
    // simplify further).
    if (op.opname() == u_closure && next.opname() == u_mul
          && optimization_pass() > 1) {
        Symbol *a = opargsym(op,0);
        Symbol *name = opargsym(op,1);
        Symbol *aa = opargsym(next,1);
        Symbol *weight = opargsym(next,2);
        int weightarg = 2;
        if (weight->typespec().is_closure()) {  // opposite order
            std::swap (aa, weight);
            weightarg = 1;
        }
        if (name->typespec().is_string() &&
            a->firstuse() >= opnum && a->lastuse() <= op2num &&
            a == aa && weight->typespec().is_triple()) {
            if (is_zero(*weight)) {
                // Scaling by zero -- the whole closure is moot.
                turn_into_nop (op, "zero-weighted closure");
                turn_into_assign (next, add_constant(0.0f),
                                  "zero-weighted closure");
                return 1;
            }
            // FIXME - handle weight being a float as well
            std::vector<int> newargs;
            newargs.push_back (oparg(next,0)); // B
            if (! is_one(*weight))
                newargs.push_back (oparg(next,weightarg)); // weight
            for (int i = 1;  i < op.nargs();  ++i)
                newargs.push_back (oparg(op,i));
            turn_into_nop (op, "combine closure+mul");
            turn_into_nop (next, "combine closure+mul");
            insert_code (opnum, u_closure, newargs,
                         RecomputeRWRanges, GroupWithNext);
            if (debug() > 1)
                std::cout << "op " << opnum << "-" << (op2num)
                          << " combined closure+mul\n";
            return 1;
        }
    }

    // No changes
    return 0;
}
1781 
1782 
1783 
1784 /// Mark our params that feed to later layers, and whether we have any
1785 /// outgoing connections.
1786 void
mark_outgoing_connections()1787 RuntimeOptimizer::mark_outgoing_connections ()
1788 {
1789     OSL_ASSERT (! inst()->m_instoverrides.size() &&
1790                "don't call this before copy_code_from_master");
1791     inst()->outgoing_connections (false);
1792     FOREACH_PARAM (auto&& s, inst())
1793         s.connected_down (false);
1794     for (int lay = layer()+1;  lay < group().nlayers();  ++lay) {
1795         for (auto&& c : group()[lay]->m_connections)
1796             if (c.srclayer == layer()) {
1797                 inst()->symbol(c.src.param)->connected_down (true);
1798                 inst()->outgoing_connections (true);
1799             }
1800     }
1801 }
1802 
1803 
1804 
1805 /// Check all params and output params to find any that are neither used
1806 /// in the shader (aside from their own init ops, which shouldn't count)
1807 /// nor connected to downstream layers, and for those, remove their init
1808 /// ops and connections.
1809 /// Precondition: mark_outgoing_connections should be up to date.
1810 int
remove_unused_params()1811 RuntimeOptimizer::remove_unused_params ()
1812 {
1813     int alterations = 0;
1814     SymNeverUsed param_never_used (*this, inst());  // handy predicate
1815 
1816     // Get rid of unused params' init ops and clear their read/write ranges
1817     FOREACH_PARAM (auto&& s, inst()) {
1818         if (param_never_used(s) && s.has_init_ops()) {
1819             std::string why;
1820             if (debug() > 1)
1821                 why = Strutil::sprintf ("remove init ops of unused param %s %s", s.typespec(), s.name());
1822             turn_into_nop (s.initbegin(), s.initend(), why);
1823             s.set_initrange (0, 0);
1824             s.clear_rw();   // mark as totally unused
1825             ++alterations;
1826         }
1827     }
1828 
1829     // Get rid of the Connections themselves
1830     if (debug() > 1) {
1831         for (auto&& c : inst()->connections()) {
1832             if (param_never_used(c)) {
1833                 debug_optf("  Connection no longer needed: %s %s\n",
1834                            group()[c.srclayer]->layername(),
1835                            c.str(group(), inst()));
1836             }
1837         }
1838     }
1839     erase_if (inst()->connections(), param_never_used);
1840 
1841     return alterations;
1842 }
1843 
1844 
1845 
1846 void
catalog_symbol_writes(int opbegin,int opend,FastIntSet & syms)1847 RuntimeOptimizer::catalog_symbol_writes (int opbegin, int opend,
1848                                          FastIntSet &syms)
1849 {
1850     for (int i = opbegin; i < opend; ++i) {
1851         const Opcode &op (inst()->ops()[i]);
1852         for (int a = 0, nargs = op.nargs();  a < nargs;  ++a) {
1853             if (op.argwrite(a))
1854                 syms.insert (oparg (op, a));
1855         }
1856     }
1857 }
1858 
1859 
1860 
/// Find situations where an output param is simply an unconditional
/// copy of a connected input param, and eliminate the middleman by
/// rewiring downstream layers to connect directly to the upstream
/// source.  Returns the number of connections rewired.
int
RuntimeOptimizer::eliminate_middleman ()
{
    int changed = 0;
    FOREACH_PARAM (auto&& s, inst()) {
        // Skip if this isn't a shader output parameter that's connected
        // to a later layer.
        if (s.symtype() != SymTypeOutputParam || !s.connected_down())
            continue;
        // If it's written more than once, or has init ops, don't bother
        if (s.firstwrite() != s.lastwrite() || s.has_init_ops())
            continue;
        // Ok, s is a connected output, written only once, without init ops.

        // If the one time it's written isn't a simple assignment, never mind
        int opnum = s.firstwrite();
        Opcode &op (inst()->ops()[opnum]);
        if (op.opname() != u_assign)
            continue;   // only consider direct assignments
        // Now what's it assigned from?  If it's not a connected
        // parameter, or if it's not an equivalent data type, or if it's
        // a closure, never mind.
        int src_index = oparg (op, 1);
        Symbol *src = opargsym (op, 1);

        if (! (src->symtype() == SymTypeParam && src->connected()) ||
              ! equivalent(src->typespec(), s.typespec()) ||
              s.typespec().is_closure())
            continue;

        // Only works if the assignment is unconditional.  Needs to not
        // be in a conditional or loop, and not have any exit or return
        // statement before the assignment.
        if (! op_is_unconditionally_executed (opnum))
            continue;

        // OK, output param 's' is simply and unconditionally assigned
        // the value of the equivalently-typed input parameter 'src'.
        // Doctor downstream shaders that use s to connect directly to
        // src.

        // First, find what src is connected to.  Only a "complete"
        // (whole-symbol, non-closure, type-equivalent) connection
        // qualifies; partial/component connections are left alone.
        int upstream_layer = -1, upstream_symbol = -1;
        for (int i = 0, e = inst()->nconnections();  i < e;  ++i) {
            const Connection &c = inst()->connection(i);
            if (c.dst.param == src_index &&  // the connection we want
                c.src.is_complete() && c.dst.is_complete() &&
                equivalent(c.src.type,c.dst.type) &&
                !c.src.type.is_closure() && ! c.dst.type.is_closure()) {
                upstream_layer = c.srclayer;
                upstream_symbol = c.src.param;
                break;
            }
        }
        if (upstream_layer < 0 || upstream_symbol < 0)
            continue;  // not a complete connection, forget it

        ShaderInstance *upinst = group()[upstream_layer];
        if (debug() > 1)
            std::cout << "Noticing that " << inst()->layername() << "."
                      << s.name() << " merely copied from " << src->name()
                      << ", connected from " << upinst->layername() << "."
                      << upinst->symbol(upstream_symbol)->name() << "\n";

        // Find all the downstream connections of s, make them
        // connections to src.
        int s_index = inst()->symbolindex(&s);
        for (int laynum = layer()+1;  laynum < group().nlayers();  ++laynum) {
            ShaderInstance *downinst = group()[laynum];
            for (int i = 0, e = downinst->nconnections();  i < e;  ++i) {
                Connection &c = downinst->connections()[i];
                if (c.srclayer == layer() && // connected to our layer
                    c.src.param == s_index && // connected to s
                    c.src.is_complete() && c.dst.is_complete() &&
                    equivalent(c.src.type,c.dst.type)) {
                    // just change the connection's referrant to the
                    // upstream source of s.
                    c.srclayer = upstream_layer;
                    c.src.param = upstream_symbol;
                    ++changed;
                    shadingsys().m_stat_middlemen_eliminated += 1;
                    if (debug() > 1) {
                        // Symbol may live on the instance or (if not
                        // copied yet) only on the master; check both.
                        const Symbol *dsym = downinst->symbol(c.dst.param);
                        if (! dsym)
                            dsym = downinst->mastersymbol(c.dst.param);
                        const Symbol *usym = upinst->symbol(upstream_symbol);
                        if (! usym)
                            usym = upinst->mastersymbol(upstream_symbol);
                        OSL_DASSERT (dsym && usym);
                        std::cout << "Removed " << inst()->layername() << "."
                                  << s.name() << " middleman for "
                                  << downinst->layername() << "."
                                  << dsym->name() << ", now connected to "
                                  << upinst->layername() << "."
                                  << usym->name() << "\n";
                    }
                }
            }
        }
    }
    return changed;
}
1965 
1966 
1967 
/// Apply the family of optimizations specific to 'assign' ops to the
/// op at 'opnum' (which must currently be an assignment): redundant
/// reassignment removal, constant coercion/folding, block aliasing,
/// replacement of write-once locals by constants, elision of dead and
/// self-assignments.  Returns the number of changes made.
int
RuntimeOptimizer::optimize_assignment (Opcode &op, int opnum)
{
    // Various optimizations specific to assignment statements
    OSL_DASSERT (op.opname() == u_assign);
    int changed = 0;
    Symbol *R (inst()->argsymbol(op.firstarg()+0));
    Symbol *A (inst()->argsymbol(op.firstarg()+1));
    bool R_local_or_tmp = (R->symtype() == SymTypeLocal ||
                           R->symtype() == SymTypeTemp);
    // If R and A are already known to hold the same value (aliased in
    // either direction within this block), the assignment is a no-op.
    if (block_alias(inst()->arg(op.firstarg())) == inst()->arg(op.firstarg()+1) ||
        block_alias(inst()->arg(op.firstarg()+1)) == inst()->arg(op.firstarg())) {
        // We're re-assigning something already aliased, skip it
        turn_into_nop (op, "reassignment of current value (2)");
        return ++changed;
    }
    if (coerce_assigned_constant (op)) {
        // A may have changed, so we need to reset it
        A = inst()->argsymbol(op.firstarg()+1);
        ++changed;
    }
    // NOW do assignment constant folding, only after we
    // have performed all the other transformations that may
    // turn this op into an assignment.
    changed += constfold_assign (*this, opnum);
    if (op.opname() != u_assign) {
        // The const fold has changed the assignment to something
        // other than assign (presumably nop), so skip the other
        // assignment transformations below.
        // NOTE(review): this returns 0 even though 'changed' may be
        // nonzero from the folds above -- confirm the caller doesn't
        // depend on the lost count (could under-report changes for the
        // pass-convergence logic).
        return 0;
    }
    if ((A->is_constant() || A->lastwrite() < opnum) &&
        equivalent(R->typespec(), A->typespec())) {
        // Safe to alias R to A for this block, if A is a
        // constant or if it's never written to again.
        block_alias (inst()->arg(op.firstarg()),
                         inst()->arg(op.firstarg()+1));
        // std::cerr << opnum << " aliasing " << R->mangled() << " to "
        //       << inst()->argsymbol(op.firstarg()+1)->mangled() << "\n";
    }
    if (A->is_constant() && R->typespec() == A->typespec() &&
        R_local_or_tmp &&
        R->firstwrite() == opnum && R->lastwrite() == opnum) {
        // This local or temp is written only once in the
        // whole shader -- on this statement -- and it's
        // assigned a constant.  So just alias it to the
        // constant.
        int cind = inst()->args()[op.firstarg()+1];
        global_alias (inst()->args()[op.firstarg()], cind);
        turn_into_nop (op, "replace symbol with constant");
        return ++changed;
    }
    if (R_local_or_tmp && ! R->everread()) {
        // This local is written but NEVER READ.  nop it.
        turn_into_nop (op, "local/tmp never read");
        return ++changed;
    }
    if (outparam_assign_elision (opnum, op)) {
        return ++changed;
    }
    if (R == A) {
        // Just an assignment to itself -- turn into NOP!
        turn_into_nop (op, "self-assignment");
        return ++changed;
    } else if (R_local_or_tmp && R->lastread() < opnum
               && ! m_in_loop[opnum]) {
        // Don't bother assigning if we never read it again
        // (not safe inside a loop, where an earlier-numbered read may
        // still execute on a later iteration).
        turn_into_nop (op, "symbol never read again");
        return ++changed;
    }
    return changed;
}
2040 
2041 
2042 
2043 void
copy_block_aliases(const FastIntMap & old_block_aliases,FastIntMap & new_block_aliases,const FastIntSet * excluded,bool copy_temps)2044 RuntimeOptimizer::copy_block_aliases (const FastIntMap &old_block_aliases,
2045                                       FastIntMap &new_block_aliases,
2046                                       const FastIntSet *excluded,
2047                                       bool copy_temps)
2048 {
2049     OSL_ASSERT (&old_block_aliases != &new_block_aliases &&
2050                 "copy_block_aliases does not work in-place");
2051     // Find all symbols written anywhere in the instruction range
2052     new_block_aliases.clear ();
2053     new_block_aliases.reserve (old_block_aliases.size());
2054     for (auto&& oba : old_block_aliases) {
2055         if (oba.second < 0)
2056             continue;    // erased alias -- don't copy
2057         if (! copy_temps && (inst()->symbol(oba.first)->is_temp() ||
2058                              inst()->symbol(oba.second)->is_temp()))
2059             continue;    // don't copy temp aliases unless told to
2060         if (excluded && (excluded->find(oba.first) != excluded->end() ||
2061                          excluded->find(oba.second) != excluded->end()))
2062             continue;    // don't copy from excluded list
2063         new_block_aliases[oba.first] = oba.second;
2064     }
2065 }
2066 
2067 
2068 
2069 int
optimize_ops(int beginop,int endop,FastIntMap * seed_block_aliases)2070 RuntimeOptimizer::optimize_ops (int beginop, int endop,
2071                                 FastIntMap *seed_block_aliases)
2072 {
2073     if (beginop >= endop)
2074         return 0;
2075 
2076     // Constant aliases valid for just this basic block
2077     clear_block_aliases ();
2078 
2079     // Provide a place where, if we recurse, we can save prior block
2080     // aliases. Register them on the block_aliases_stack so that calls to
2081     // block_unalias() will unalias from there, too.
2082     FastIntMap saved_block_aliases;
2083     m_block_aliases_stack.push_back (&saved_block_aliases);
2084 
2085     int lastblock = -1;
2086     int skipops = 0;   // extra inserted ops to skip over
2087     int changed = 0;
2088     size_t num_ops = inst()->ops().size();
2089     size_t old_num_ops = num_ops;   // track when it changes
2090     for (int opnum = beginop;  opnum < endop;  opnum += 1) {
2091         OSL_DASSERT (old_num_ops == num_ops); // better not happen unknowingly
2092         OSL_DASSERT(num_ops == inst()->ops().size());
2093         OSL_DASSERT(size_t(opnum) < inst()->ops().size());
2094         if (m_stop_optimizing)
2095             break;
2096         Opcode *op = &inst()->ops()[opnum];
2097         if (skipops) {
2098             // If a previous optimization inserted ops and told us
2099             // to skip over the new ones, we still need to unalias
2100             // any symbols written by this op, but otherwise skip
2101             // all subsequent optimizations until we run down the
2102             // skipops counter.
2103             block_unalias_written_args (*op);
2104             OSL_ASSERT (lastblock == m_bblockids[opnum] &&
2105                         "this should not be a new basic block");
2106             --skipops;
2107             continue;   // Move along to the next op, no optimization here
2108         }
2109         // Things to do if we've just moved to a new basic block
2110         if (lastblock != m_bblockids[opnum]) {
2111             clear_block_aliases (seed_block_aliases);
2112             seed_block_aliases = NULL; // only the first time
2113             clear_stale_syms ();
2114             lastblock = m_bblockids[opnum];
2115         }
2116         // Things to do at the start of main code:
2117         // * Alias output params to their initial values, if known.
2118         if (opnum == inst()->m_maincodebegin) {
2119             for (int i = inst()->firstparam();  i < inst()->lastparam();  ++i) {
2120                 Symbol *s (inst()->symbol(i));
2121                 if (s->symtype() == SymTypeOutputParam && s->lockgeom() &&
2122                       (s->valuesource() == Symbol::DefaultVal ||
2123                        s->valuesource() == Symbol::InstanceVal) &&
2124                       ! s->has_init_ops() &&
2125                       ! s->typespec().is_closure_based() &&
2126                       ! s->typespec().is_structure_based()) {
2127                     make_symbol_room (1);  // Make sure add_constant is ok
2128                     s = inst()->symbol(i);
2129                     int cind = add_constant (s->typespec(), s->data());
2130                     block_alias (i, cind); // Alias this symbol to the new const
2131                 }
2132             }
2133         }
2134         // Nothing below here to do for no-ops, take early out.
2135         if (op->opname() == u_nop || op->opname() == u_functioncall_nr)
2136             continue;
2137         // De-alias the readable args to the op and figure out if
2138         // there are any constants involved.
2139         for (int i = 0, e = op->nargs();  i < e;  ++i) {
2140             if (! op->argwrite(i)) { // Don't de-alias args that are written
2141                 int argindex = op->firstarg() + i;
2142                 int argsymindex = dealias_symbol (inst()->arg(argindex), opnum);
2143                 inst()->args()[argindex] = argsymindex;
2144             }
2145             if (op->argread(i))
2146                 use_stale_sym (oparg(*op,i));
2147         }
2148 
2149         const OpDescriptor *opd = shadingsys().op_descriptor (op->opname());
2150         // If it's a simple assignment and the lvalue is "stale", go
2151         // back and eliminate its last assignment.
2152         if (is_simple_assign(*op, opd))
2153             simple_sym_assign (oparg (*op, 0), opnum);
2154         // Make sure there's room for several more symbols, so that we
2155         // can add a few consts if we need to, without worrying about
2156         // the addresses of symbols changing when we add a new one below.
2157         make_symbol_room (max_new_consts_per_fold);
2158         // For various ops that we know how to effectively
2159         // constant-fold, dispatch to the appropriate routine.
2160         if (optimize() >= 2 && m_opt_constant_fold) {
2161             if (opd && opd->folder) {
2162                 int c = (*opd->folder) (*this, opnum);
2163                 if (c) {
2164                     changed += c;
2165                     // Re-check num_ops in case the folder inserted something
2166                     num_ops = inst()->ops().size();
2167                     skipops = num_ops - old_num_ops;
2168                     endop += num_ops - old_num_ops; // adjust how far we loop
2169                     old_num_ops = num_ops;
2170                     op = &inst()->ops()[opnum];  // in case ops resized
2171                 }
2172             }
2173         }
2174         // Clear local block aliases for any args that were written
2175         // by this op
2176         block_unalias_written_args (*op);
2177 
2178         // Now we handle assignments.
2179         if (optimize() >= 2 && op->opname() == u_assign && m_opt_assign)
2180             changed += optimize_assignment (*op, opnum);
2181         if (optimize() >= 2 && m_opt_elide_useless_ops && opd
2182             && !(opd->flags & OpDescriptor::SideEffects))
2183             changed += useless_op_elision (*op, opnum);
2184         if (m_stop_optimizing)
2185             break;
2186         // Peephole optimization involving pair of instructions (the second
2187         // instruction will be in the same basic block.
2188         if (optimize() >= 2 && m_opt_peephole && op->opname() != u_nop && op->opname() != u_functioncall_nr) {
2189             // Find the next instruction in the same basic block
2190             int op2num = next_block_instruction (opnum);
2191             if (op2num) {
2192                 int c = peephole2 (opnum, op2num);
2193                 if (c) {
2194                     changed += c;
2195                     // Re-check num_ops in case the folder inserted something
2196                     num_ops = inst()->ops().size();
2197                     // skipops = num_ops - old_num_ops;
2198                     endop += num_ops - old_num_ops; // adjust how far we loop
2199                     old_num_ops = num_ops;
2200                     op = &inst()->ops()[opnum];  // in case ops resized
2201                 }
2202             }
2203         }
2204 
2205         // Special cases for "if", "functioncall", and loops: Optimize the
2206         // sequences of instructions in the bodies recursively in a way that
2207         // allows us to be clever about the basic block alias tracking.
2208         ustring opname = op->opname();
2209         if ((opname == u_if || opname == u_functioncall ||
2210              opname == u_for || opname == u_while || opname == u_dowhile)
2211               && shadingsys().m_opt_seed_bblock_aliases) {
2212             // Find all symbols written anywhere in the instruction range
2213             // of the bodies.
2214             FastIntSet symwrites;
2215             catalog_symbol_writes (opnum+1, op->farthest_jump(), symwrites);
2216             // Save the aliases from the basic block we are exiting.
2217             // If & function call: save all prior aliases.
2218             // Loops: dont save aliases involving syms written in the loop.
2219             // Note that for both cases, we don't copy aliases involving
2220             // temps, because that breaks our later assumptions (for temp
2221             // coalescing) that temp uses never cross basic block boundaries.
2222             if (opname == u_if || opname == u_functioncall)
2223                 copy_block_aliases (m_block_aliases, saved_block_aliases);
2224             else
2225                 copy_block_aliases (m_block_aliases, saved_block_aliases,
2226                                     &symwrites);
2227             // 'if' has 2 blocks (then, else), function call has just
2228             // one (the body), loops have 4 (init, cond, body, incr),
2229             int njumps = (opname == u_if) ? 2 : (opname == u_functioncall ? 1 : 4);
2230             // Recursively optimize each body block.
2231             for (int j = 0; j < njumps; ++j) {
2232                 changed += optimize_ops (j==0 ? opnum+1 : op->jump(j-1),
2233                                          op->jump(j), &saved_block_aliases);
2234                 op = &inst()->ops()[opnum];  // in case ops resized
2235             }
2236             // Adjust optimization loop end if any instructions were added
2237             num_ops = inst()->ops().size();
2238             endop += num_ops - old_num_ops;
2239             old_num_ops = num_ops;
2240             // Now we can restore the original aliases to seed the basic
2241             // block that follows. For if/function, we need to remove all
2242             // aliases referencing syms written within the conditional or
2243             // function body. For loops, recall that we already excluded
2244             // the written syms from the saved_block_aliases.
2245             if (opname == u_if || opname == u_functioncall) {
2246                 FastIntMap restored_aliases;
2247                 restored_aliases.swap (saved_block_aliases);
2248                 // catalog again, in case optimizations in those blocks
2249                 // caused writes that weren't apparent before.
2250                 catalog_symbol_writes (opnum+1, op->farthest_jump(), symwrites);
2251                 copy_block_aliases (restored_aliases, saved_block_aliases,
2252                                     &symwrites);
2253             }
2254             seed_block_aliases = &saved_block_aliases;
2255             // Get ready to increment to the next instruction
2256             opnum = op->farthest_jump() - 1;
2257         }
2258     }
2259     m_block_aliases_stack.pop_back();  // Done with saved_block_aliases
2260     return changed;
2261 }
2262 
2263 
2264 
void
RuntimeOptimizer::optimize_instance ()
{
    // Optimize a single layer instance: simplify its parameters, then
    // repeatedly run the per-op optimization passes until the code stops
    // changing (or we hit the pass limit), and finally record which
    // messages this layer may send so later layers can fold getmessage.

    // If "opt_layername" attribute is set, only optimize the named layer
    if (!shadingsys().m_opt_layername.empty() &&
        shadingsys().m_opt_layername != inst()->layername())
        return;

    // Make a list of the indices of all constants.
    for (int i = 0, e = (int)inst()->symbols().size();  i < e;  ++i)
        if (inst()->symbol(i)->symtype() == SymTypeConst)
            m_all_consts.push_back (i);

    // Turn all parameters with instance or default values, and which
    // cannot be overridden by geometry values, into constants or
    // aliases for globals.  Also turn connections from earlier layers'
    // outputs that are known to be constants or globals into constants
    // or global aliases without any connection.
    if (optimize() >= 2 && m_opt_simplify_param) {
        simplify_params ();
    }

#ifndef NDEBUG
    // Confirm that the symbols between [firstparam,lastparam] are all
    // input or output params.
    FOREACH_PARAM (const Symbol &s, inst()) {
        OSL_DASSERT (s.symtype() == SymTypeParam ||
                     s.symtype() == SymTypeOutputParam);
    }
#endif

    // Recompute which of our params have downstream connections.
    mark_outgoing_connections ();

    // Try to fold constants.  We take several passes, until we get to
    // the point that not much is improving.  It rarely goes beyond 3-4
    // passes, but we have a hard cutoff just to be sure we don't
    // ever get into an infinite loop from an unforeseen cycle where we
    // end up inadvertently transforming A => B => A => etc.
    int totalchanged = 0;
    int reallydone = 0;   // Force a few passes after we think we're done
    int npasses = shadingsys().opt_passes();
    for (m_pass = 0;  m_pass < npasses;  ++m_pass) {

        // Once we've made one pass (and therefore called
        // mark_outgoing_connections), we may notice that the layer is
        // unused, and therefore can stop doing work to optimize it.
        if (m_pass != 0 && inst()->unused())
            break;

        if (m_stop_optimizing)
            break;

        if (debug() > 1)
            debug_optf("layer %d \"%s\", pass %d:\n",
                       layer(), inst()->layername(), m_pass);

        // Track basic blocks and conditional states
        find_conditionals ();
        find_basic_blocks ();

        // Clear local messages for this instance
        m_local_unknown_message_sent = false;
        m_local_messages_sent.clear ();

        // Figure out which params are just aliases for globals (only
        // necessary to do once, on the first pass).
        if (m_pass == 0 && optimize() >= 2)
            find_params_holding_globals ();

        // Here is the meat of the optimization, where we pass over the
        // code for this instance and make various transformations.
        int changed = optimize_ops (0, (int)inst()->ops().size());

        // Now that we've rewritten the code, we need to re-track the
        // variable lifetimes.
        track_variable_lifetimes ();

        // Recompute which of our params have downstream connections.
        mark_outgoing_connections ();

        // Find situations where an output is simply a copy of a connected
        // input, and eliminate the middleman.
        if (optimize() >= 2 && m_opt_middleman) {
            int c = eliminate_middleman ();
            if (c)
                mark_outgoing_connections ();
            changed += c;
        }

        // Elide unconnected parameters that are never read.
        if (optimize() >= 1)
            changed += remove_unused_params ();

        // FIXME -- we should re-evaluate whether writes_globals() is still
        // true for this layer.

        // If nothing changed, we're done optimizing.  But wait, it may be
        // that after re-tracking variable lifetimes, we can notice new
        // optimizations!  So force another pass, then we're really done.
        totalchanged += changed;
        if (changed < 1) {
            if (++reallydone > 3)
                break;
        } else {
            reallydone = 0;
        }
    }

    // A layer that was allowed to run lazily originally, if it no
    // longer (post-optimized) has any outgoing connections, is no
    // longer needed at all.
    if (inst()->unused()) {
        // Not needed.  Remove all its connections and ops.
        inst()->connections().clear ();
        turn_into_nop (0, (int)inst()->ops().size()-1,
                       debug() > 1 ? Strutil::sprintf("eliminate layer %s with no outward connections", inst()->layername().c_str()).c_str() : "");
        // Wipe all read/write ranges so every symbol reads as unused.
        for (auto&& s : inst()->symbols())
            s.clear_rw ();
    }

    // Now that we've optimized this layer, walk through the ops and
    // note which messages may have been sent, so subsequent layers will
    // know.
    for (auto& op : inst()->ops()) {
        if (op.opname() == u_setmessage) {
            Symbol &Name (*inst()->argsymbol(op.firstarg()+0));
            if (Name.is_constant())
                m_messages_sent.push_back (*(ustring *)Name.data());
            else
                // Non-constant message name: anything might have been sent,
                // so downstream getmessage folding must be disabled.
                m_unknown_message_sent = true;
        }
    }
}
2399 
2400 
2401 
2402 void
resolve_isconnected()2403 RuntimeOptimizer::resolve_isconnected ()
2404 {
2405     for (auto& op : inst()->ops()) {
2406         if (op.opname() == u_isconnected) {
2407             inst()->make_symbol_room (1);
2408             SymbolPtr s = inst()->argsymbol (op.firstarg() + 1);
2409             while (const StructSpec *structspec = s->typespec().structspec()) {
2410                 // How to deal with structures -- just change the reference
2411                 // to the first field in the struct.
2412                 // FIXME -- if we ever allow separate layer connection of
2413                 // individual struct members, this will need something more
2414                 // sophisticated.
2415                 OSL_DASSERT (structspec && structspec->numfields() >= 1);
2416                 std::string fieldname = (s->name().string() + "." +
2417                                          structspec->field(0).name.string());
2418                 int fieldsymid = inst()->findparam (ustring(fieldname));
2419                 OSL_DASSERT (fieldsymid >= 0);
2420                 s = inst()->symbol(fieldsymid);
2421             }
2422             bool upconnected = s->connected();
2423             if (!s->lockgeom() && shadingsys().userdata_isconnected())
2424                 upconnected = true;
2425             int val = (upconnected ? 1 : 0) + (s->connected_down() ? 2 : 0);
2426             turn_into_assign (op, add_constant(TypeDesc::TypeInt, &val),
2427                               "resolve isconnected()");
2428         }
2429     }
2430 }
2431 
2432 
2433 
2434 void
track_variable_lifetimes(const SymbolPtrVec & allsymptrs)2435 RuntimeOptimizer::track_variable_lifetimes (const SymbolPtrVec &allsymptrs)
2436 {
2437     SymbolPtrVec oparg_ptrs;
2438     oparg_ptrs.reserve (inst()->args().size());
2439     for (auto&& a : inst()->args())
2440         oparg_ptrs.push_back (inst()->symbol (a));
2441 
2442     if (m_bblockids.size() != inst()->ops().size())
2443         find_basic_blocks ();
2444 
2445     OSLCompilerImpl::track_variable_lifetimes (inst()->ops(), oparg_ptrs,
2446                                                allsymptrs, &m_bblockids);
2447 }
2448 
2449 
2450 
2451 void
track_variable_lifetimes()2452 RuntimeOptimizer::track_variable_lifetimes ()
2453 {
2454     SymbolPtrVec allsymptrs;
2455     allsymptrs.reserve (inst()->symbols().size());
2456     for (auto&& s : inst()->symbols())
2457         allsymptrs.push_back (&s);
2458 
2459     track_variable_lifetimes (allsymptrs);
2460 }
2461 
2462 
2463 // This has O(n^2) memory usage, so only for debugging
2464 //#define DEBUG_SYMBOL_DEPENDENCIES
2465 
2466 // Add to the dependency map that "symbol A depends on symbol B".
2467 void
add_dependency(SymDependency & dmap,int A,int B)2468 RuntimeOptimizer::add_dependency (SymDependency &dmap, int A, int B)
2469 {
2470     OSL_DASSERT (A < (int)inst()->symbols().size());
2471     OSL_DASSERT (B < (int)inst()->symbols().size());
2472     dmap[A].insert (B);
2473 
2474 #ifdef DEBUG_SYMBOL_DEPENDENCIES
2475     // Unification -- make all of B's dependencies be dependencies of A.
2476     for (auto&& r : dmap[B])
2477         dmap[A].insert (r);
2478 #endif
2479 }
2480 
2481 
2482 void
syms_used_in_op(Opcode & op,std::vector<int> & rsyms,std::vector<int> & wsyms)2483 RuntimeOptimizer::syms_used_in_op (Opcode &op, std::vector<int> &rsyms,
2484                                    std::vector<int> &wsyms)
2485 {
2486     rsyms.clear ();
2487     wsyms.clear ();
2488     for (int i = 0;  i < op.nargs();  ++i) {
2489         int arg = inst()->arg (i + op.firstarg());
2490         if (op.argread(i))
2491             if (std::find (rsyms.begin(), rsyms.end(), arg) == rsyms.end())
2492                 rsyms.push_back (arg);
2493         if (op.argwrite(i))
2494             if (std::find (wsyms.begin(), wsyms.end(), arg) == wsyms.end())
2495                 wsyms.push_back (arg);
2496     }
2497 }
2498 
2499 
2500 
2501 // Fake symbol index for "derivatives" entry in dependency map.
2502 static const int DerivSym = -1;
2503 
2504 
2505 // Recursively mark symbols that have derivatives from dependency map
2506 void
mark_symbol_derivatives(SymDependency & symdeps,SymIntSet & visited,int d)2507 RuntimeOptimizer::mark_symbol_derivatives (SymDependency &symdeps, SymIntSet &visited, int d)
2508 {
2509     for (auto&& r : symdeps[d]) {
2510         if (visited.find(r) == visited.end()) {
2511             visited.insert(r);
2512 
2513             Symbol *s = inst()->symbol(r);
2514 
2515             if (s->typespec().elementtype().is_float_based())
2516                 s->has_derivs (true);
2517 
2518             mark_symbol_derivatives(symdeps, visited, r);
2519         }
2520     }
2521 }
2522 
2523 
2524 /// Run through all the ops, for each one marking its 'written'
2525 /// arguments as dependent upon its 'read' arguments (and performing
2526 /// unification as we go), yielding a dependency map that lets us look
2527 /// up any symbol and see the set of other symbols on which it ever
2528 /// depends on during execution of the shader.
2529 void
track_variable_dependencies()2530 RuntimeOptimizer::track_variable_dependencies ()
2531 {
2532     SymDependency symdeps;
2533 
2534     // It's important to note that this is simplistically conservative
2535     // in that it overestimates dependencies.  To see why this is the
2536     // case, consider the following code:
2537     //       // inputs a,b; outputs x,y; local variable t
2538     //       t = a;
2539     //       x = t;
2540     //       t = b;
2541     //       y = t;
2542     // We can see that x depends on a and y depends on b.  But the
2543     // dependency analysis we do below thinks that y also depends on a
2544     // (because t depended on both a and b, but at different times).
2545     //
2546     // This naivite will never miss a dependency, but it may
2547     // overestimate dependencies.  (Hence we call this "conservative"
2548     // rather than "wrong.")  We deem this acceptable for now, since
2549     // it's so much easer to implement the conservative dependency
2550     // analysis, and it's not yet clear that getting it closer to
2551     // optimal will have any performance impact on final shaders. Also
2552     // because this is probably no worse than the "dependency slop" that
2553     // would happen with loops and conditionals.  But we certainly may
2554     // revisit with a more sophisticated algorithm if this crops up
2555     // a legitimate issue.
2556     //
2557     // Because of this conservative approach, it is critical that this
2558     // analysis is done BEFORE temporaries are coalesced (which would
2559     // cause them to be reassigned in exactly the way that confuses this
2560     // analysis).
2561 
2562     symdeps.clear ();
2563 
2564     std::vector<int> read, written;
2565     bool forcederivs = shadingsys().force_derivs();
2566     // Loop over all ops...
2567     for (auto&& op : inst()->ops()) {
2568         // Gather the list of syms read and written by the op.  Reuse the
2569         // vectors defined outside the loop to cut down on malloc/free.
2570         read.clear ();
2571         written.clear ();
2572         syms_used_in_op (op, read, written);
2573 
2574         // FIXME -- special cases here!  like if any ops implicitly read
2575         // or write to globals without them needing to be arguments.
2576 
2577         // For each symbol w written by the op...
2578         for (auto&& w : written) {
2579             // For each symbol r read by the op, make w depend on r.
2580             // (Unless r is a constant , in which case it's not necessary.)
2581             for (auto&& r : read)
2582                 if (inst()->symbol(r)->symtype() != SymTypeConst)
2583                     add_dependency (symdeps, w, r);
2584             // If the op takes derivs, make the pseudo-symbol DerivSym
2585             // depend on those arguments.
2586             if (op.argtakesderivs_all() || forcederivs) {
2587                 for (int a = 0;  a < op.nargs();  ++a)
2588                     if (op.argtakesderivs(a) || forcederivs) {
2589                         Symbol &s (*opargsym (op, a));
2590                         // Constants can't take derivs
2591                         if (s.symtype() == SymTypeConst)
2592                             continue;
2593                         // Non-float types can't take derivs
2594                         if (s.typespec().is_closure() ||
2595                             s.typespec().simpletype().basetype != TypeDesc::FLOAT)
2596                             continue;
2597                         // Careful -- not all globals can take derivs
2598                         if (s.symtype() == SymTypeGlobal &&
2599                             ! (s.mangled() == Strings::P ||
2600                                s.mangled() == Strings::I ||
2601                                s.mangled() == Strings::u ||
2602                                s.mangled() == Strings::v ||
2603                                s.mangled() == Strings::Ps))
2604                             continue;
2605                         add_dependency (symdeps, DerivSym,
2606                                         inst()->arg(a+op.firstarg()));
2607                     }
2608             }
2609         }
2610     }
2611 
2612     // Propagate derivative dependencies for any syms already known to
2613     // need derivs.  It's probably marked that way because another layer
2614     // downstream connects to it and needs derivatives of that
2615     // connection.
2616     int snum = 0;
2617     for (auto&& s : inst()->symbols()) {
2618         // Globals that get written should always provide derivs.
2619         // Exclude N, since its derivs are unreliable anyway, so no point
2620         // making it cause the whole disp shader to need derivs.
2621         if (s.symtype() == SymTypeGlobal && s.everwritten() &&
2622               !s.typespec().is_closure_based() && s.mangled() != Strings::N)
2623             s.has_derivs(true);
2624         if (s.has_derivs())
2625             add_dependency (symdeps, DerivSym, snum);
2626         ++snum;
2627     }
2628 
2629     // Mark all symbols needing derivatives as such
2630     SymIntSet visited;
2631     mark_symbol_derivatives (symdeps, visited, DerivSym);
2632 
2633     // Only some globals are allowed to have derivatives
2634     for (auto&& s : inst()->symbols()) {
2635         if (s.symtype() == SymTypeGlobal &&
2636             ! (s.mangled() == Strings::P ||
2637                s.mangled() == Strings::I ||
2638                s.mangled() == Strings::u ||
2639                s.mangled() == Strings::v ||
2640                s.mangled() == Strings::Ps))
2641             s.has_derivs (false);
2642     }
2643 
2644 #ifdef DEBUG_SYMBOL_DEPENDENCIES
2645     // Helpful for debugging
2646 
2647     std::cerr << "track_variable_dependencies\n";
2648     std::cerr << "\nDependencies:\n";
2649     for (auto&& m : symdeps) {
2650         if (m.first == DerivSym)
2651             std::cerr << "$derivs depends on ";
2652         else
2653             std::cerr << inst->symbol(m.first)->mangled() << " depends on ";
2654         for (auto&& d : m.second) {
2655             if (d == DerivSym)
2656                 std::cerr << "$derivs ";
2657             else
2658                 std::cerr << inst->symbol(d)->mangled() << ' ';
2659         }
2660         std::cerr << "\n";
2661     }
2662     std::cerr << "\n\n";
2663 
2664     // Invert the dependency
2665     SymDependency influences;
2666     for (auto&& m : symdeps)
2667         for (auto&& d : m.second)
2668             influences[d].insert (m.first);
2669 
2670     std::cerr << "\nReverse dependencies:\n";
2671     for (auto&& m : influences) {
2672         if (m.first == DerivSym)
2673             std::cerr << "$derivs contrbutes to ";
2674         else
2675             std::cerr << inst->symbol(m.first)->mangled() << " contributes to ";
2676         for (auto&& d : m.second) {
2677             if (d == DerivSym)
2678                 std::cerr << "$derivs ";
2679             else
2680                 std::cerr << inst->symbol(d)->mangled() << ' ';
2681         }
2682         std::cerr << "\n";
2683     }
2684     std::cerr << "\n\n";
2685 #endif
2686 }
2687 
2688 
2689 
2690 // Is the symbol coalescable?
2691 inline bool
coalescable(const Symbol & s)2692 coalescable (const Symbol &s)
2693 {
2694     return (s.symtype() == SymTypeTemp &&     // only coalesce temporaries
2695             s.everused() &&                   // only if they're used
2696             s.dealias() == &s &&              // only if not already aliased
2697             ! s.typespec().is_structure() &&  // only if not a struct
2698             s.fieldid() < 0);                 //    or a struct field
2699 }
2700 
2701 
2702 
2703 /// Coalesce temporaries.  During code generation, we make a new
2704 /// temporary EVERY time we need one.  Now we examine them all and merge
2705 /// ones of identical type and non-overlapping lifetimes.
void
RuntimeOptimizer::coalesce_temporaries ()
{
    // Merge temporaries of identical type and non-overlapping lifetimes,
    // shrinking the amount of storage the shader needs at runtime.

    // We keep looping until we can't coalesce any more.
    int ncoalesced = 1;
    while (ncoalesced) {
        ncoalesced = 0;   // assume we're done, unless we coalesce something

        // We use a greedy algorithm that loops over each symbol, and
        // then examines all higher-numbered symbols (in order) and
        // tries to merge the first one it can find that doesn't overlap
        // lifetimes.  The temps were created as we generated code, so
        // they are already sorted by their "first use".  Thus, for any
        // pair t1 and t2 that are merged, it is guaranteed that t2 is
        // the symbol whose first use is the earliest of all symbols whose
        // lifetimes do not overlap t1.

        SymbolVec::iterator s;
        for (s = inst()->symbols().begin(); s != inst()->symbols().end(); ++s) {
            // Skip syms that can't be (or don't need to be) coalesced
            if (! coalescable(*s))
                continue;

            int sfirst = s->firstuse ();
            int slast  = s->lastuse ();

            // Loop through every other symbol
            for (SymbolVec::iterator t = s+1; t != inst()->symbols().end(); ++t) {
                // Coalesce s and t if both syms are coalescable,
                // equivalent types, have nonoverlapping lifetimes,
                // and either both do or both do not need derivatives.
                if (coalescable (*t) &&
                      equivalent (s->typespec(), t->typespec()) &&
                      s->has_derivs() == t->has_derivs() &&
                      (slast < t->firstuse() || sfirst > t->lastuse())) {
                    // Make all future t references alias to s
                    t->alias (&(*s));
                    // s gets union of the lifetimes
                    s->union_rw (t->firstread(), t->lastread(),
                                 t->firstwrite(), t->lastwrite());
                    // Refresh the cached extents: s's lifetime just grew,
                    // so later candidates in this inner loop must be
                    // checked against the widened range.
                    sfirst = s->firstuse ();
                    slast  = s->lastuse ();
                    // t gets marked as unused
                    t->clear_rw ();
                    ++ncoalesced;
                }
            }
        }
        // std::cerr << "Coalesced " << ncoalesced << "\n";
    }

    // Since we may have aliased temps, now we need to make sure all
    // symbol refs are dealiased.  Rewrite each op argument to index the
    // alias target (pointer arithmetic against symbol 0 recovers the
    // symbol's table index).
    for (auto&& arg : inst()->args()) {
        Symbol *s = inst()->symbol(arg);
        s = s->dealias ();
        arg = s - inst()->symbol(0);
    }
}
2765 
2766 
2767 
2768 void
post_optimize_instance()2769 RuntimeOptimizer::post_optimize_instance ()
2770 {
2771     inst()->evaluate_writes_globals_and_userdata_params ();
2772 
2773     if (inst()->unused())
2774         return;    // skip the expensive stuff if we're not used anyway
2775 
2776     SymbolPtrVec allsymptrs;
2777     allsymptrs.reserve (inst()->symbols().size());
2778     for (auto&& s : inst()->symbols())
2779         allsymptrs.push_back (&s);
2780 
2781     m_bblockids.clear ();       // Keep insert_code from getting confused
2782     m_in_conditional.clear ();
2783     m_in_loop.clear ();
2784 
2785     add_useparam (allsymptrs);
2786 
2787     if (optimize() >= 1 && m_opt_coalesce_temps)
2788         coalesce_temporaries ();
2789 }
2790 
2791 
2792 
void
RuntimeOptimizer::collapse_syms ()
{
    //
    // Make a new symbol table that removes all the unused symbols.
    //

    // Mark our params that feed to later layers, so that unused params
    // that aren't needed downstream can be removed.
    mark_outgoing_connections ();

    SymbolVec new_symbols;          // buffer for new symbol table
    std::vector<int> symbol_remap;  // mapping of old sym index to new
    int total_syms = 0;             // number of new symbols we'll need
    SymNeverUsed never_used (*this, inst());  // handy predicate

    // First, just count how many we need and set up the mapping.
    // (For a removed symbol, the remap entry ends up pointing at the
    // next surviving symbol's new index; such entries are never used,
    // since only kept symbols are referenced below.)
    for (auto&& s : inst()->symbols()) {
        symbol_remap.push_back (total_syms);
        if (! never_used (s))
            ++total_syms;
    }

    // Now make a new table of the right (new) size, and copy the used syms
    new_symbols.reserve (total_syms);
    for (auto&& s : inst()->symbols()) {
        if (! never_used (s))
            new_symbols.push_back (s);
    }

    // Remap all the function arguments to the new indices
    for (auto&& arg : inst()->m_instargs)
        arg = symbol_remap[arg];

    // Fix our connections from upstream shaders
    for (auto&& c : inst()->m_connections)
        c.dst.param = symbol_remap[c.dst.param];

    // Fix downstream connections that reference us
    for (int lay = layer()+1;  lay < group().nlayers();  ++lay) {
        for (auto&& c : group()[lay]->m_connections)
            if (c.srclayer == layer())
                c.src.param = symbol_remap[c.src.param];
    }

    // Swap the new symbol list for the old.
    std::swap (inst()->m_instsymbols, new_symbols);
    {
        // adjust memory stats
        // Remember that they're already swapped, so new_symbols now
        // holds the OLD (larger) table; mem is the (negative) delta.
        off_t mem = vectorbytes(new_symbols) - vectorbytes(inst()->m_instsymbols);
        ShadingSystemImpl &ss (shadingsys());
        spin_lock lock (ss.m_stat_mutex);
        ss.m_stat_mem_inst_syms -= mem;
        ss.m_stat_mem_inst -= mem;
        ss.m_stat_memory -= mem;
    }

    // Miscellaneous cleanup of other things that used symbol indices:
    // recompute the cached P/N symbol indices and the [firstparam,
    // lastparam) range against the compacted table.
    inst()->m_Psym = -1;
    inst()->m_Nsym = -1;
    inst()->m_firstparam = -1;
    inst()->m_lastparam = -1;
    int i = 0;
    for (auto&& s : inst()->symbols()) {
        if (s.symtype() == SymTypeParam || s.symtype() == SymTypeOutputParam) {
            if (inst()->m_firstparam < 0)
                inst()->m_firstparam = i;
            inst()->m_lastparam = i+1;
        }
        if (s.name() == Strings::P)
            inst()->m_Psym = i;
        else if (s.name() == Strings::N)
            inst()->m_Nsym = i;
        ++i;
    }
#ifndef NDEBUG
    // Confirm that the symbols between [firstparam,lastparam] are all
    // input or output params.
    FOREACH_PARAM (const Symbol &s, inst()) {
        OSL_DASSERT (s.symtype() == SymTypeParam ||
                     s.symtype() == SymTypeOutputParam);
    }
#endif
}
2878 
2879 
2880 
void
RuntimeOptimizer::collapse_ops ()
{
    //
    // Make new code that removes all the nops
    //
    OpcodeVec new_ops;              // buffer for new code
    std::vector<int> op_remap;      // mapping of old opcode indices to new
    int total_ops = 0;              // number of new ops we'll need

    // First, just count how many we need and set up the mapping.
    // (A nop's remap entry points at the next surviving op, so a jump
    // that targeted an elided nop lands on the next real instruction.)
    for (auto&& op : inst()->ops()) {
        op_remap.push_back (total_ops);
        if (op.opname() != u_nop)
            ++total_ops;
    }

    // Now make a new table of the right (new) size, copy the used ops, and
    // reset the jump addresses.
    new_ops.reserve (total_ops);
    for (auto&& op : inst()->ops()) {
        if (op.opname() != u_nop) {
            new_ops.push_back (op);
            Opcode &newop (new_ops.back());
            for (int i = 0;  i < (int)Opcode::max_jumps;  ++i)
                if (newop.jump(i) >= 0)
                    newop.jump(i) = op_remap[newop.jump(i)];
        }
    }

    // Adjust 'main' code range and init op ranges
    inst()->m_maincodebegin = op_remap[inst()->m_maincodebegin];
    inst()->m_maincodeend = (int)new_ops.size();
    FOREACH_PARAM (auto&& s, inst()) {
        if (s.has_init_ops()) {
            s.initbegin (op_remap[s.initbegin()]);
            // An initend that was one-past-the-last old op has no remap
            // entry; clamp it to the new end-of-code instead.
            if (s.initend() < (int)op_remap.size())
                s.initend (op_remap[s.initend()]);
            else
                s.initend ((int)new_ops.size());
        }
    }

    // Swap the new code for the old.
    std::swap (inst()->m_instops, new_ops);

    // These are no longer valid (they were indexed per old op).
    m_bblockids.clear ();
    m_in_conditional.clear ();
    m_in_loop.clear ();
}
2932 
2933 
2934 
2935 std::ostream &
printinst(std::ostream & out) const2936 RuntimeOptimizer::printinst (std::ostream &out) const
2937 {
2938     out << "Shader " << inst()->shadername() << "\n";
2939     out << (inst()->unused() ? " UNUSED" : "");
2940     out << " connections in=" << inst()->nconnections();
2941     out << " out=" << inst()->outgoing_connections();
2942     out << (inst()->writes_globals() ? " writes_globals" : "");
2943     out << (inst()->userdata_params() ? " userdata_params" : "");
2944     out << (inst()->run_lazily() ? " run_lazily" : " run_unconditionally");
2945     out << (inst()->outgoing_connections() ? " outgoing_connections" : "");
2946     out << (inst()->renderer_outputs() ? " renderer_outputs" : "");
2947     out << (inst()->writes_globals() ? " writes_globals" : "");
2948     out << (inst()->entry_layer() ? " entry_layer" : "");
2949     out << (inst()->last_layer() ? " last_layer" : "");
2950     out << "\n";
2951     out << "  symbols:\n";
2952     for (size_t i = 0, e = inst()->symbols().size();  i < e;  ++i)
2953         inst()->symbol(i)->print (out, 256);
2954 #if 0
2955     out << "  int consts:\n    ";
2956     for (size_t i = 0;  i < inst()->m_iconsts.size();  ++i)
2957         out << inst()->m_iconsts[i] << ' ';
2958     out << "\n";
2959     out << "  float consts:\n    ";
2960     for (size_t i = 0;  i < inst()->m_fconsts.size();  ++i)
2961         out << inst()->m_fconsts[i] << ' ';
2962     out << "\n";
2963     out << "  string consts:\n    ";
2964     for (size_t i = 0;  i < inst()->m_sconsts.size();  ++i)
2965         out << "\"" << Strutil::escape_chars(inst()->m_sconsts[i]) << "\" ";
2966     out << "\n";
2967 #endif
2968     out << "  code:\n";
2969     for (size_t i = 0, e = inst()->ops().size();  i < e;  ++i) {
2970         const Opcode &op (inst()->ops()[i]);
2971         if (i == (size_t)inst()->maincodebegin())
2972             out << "(main)\n";
2973         out << "    " << i << ": " << op.opname();
2974         bool allconst = true;
2975         for (int a = 0;  a < op.nargs();  ++a) {
2976             const Symbol *s (inst()->argsymbol(op.firstarg()+a));
2977             out << " " << s->name();
2978             if (s->symtype() == SymTypeConst) {
2979                 out << " (";
2980                 s->print_vals(out,16);
2981                 out << ")";
2982             }
2983             if (op.argread(a))
2984                 allconst &= s->is_constant();
2985         }
2986         for (size_t j = 0;  j < Opcode::max_jumps;  ++j)
2987             if (op.jump(j) >= 0)
2988                 out << " " << op.jump(j);
2989         out << "\t# ";
2990 //        out << "    rw " << Strutil::sprintf("%x",op.argread_bits())
2991 //            << ' ' << op.argwrite_bits();
2992         if (op.argtakesderivs_all())
2993             out << " %derivs(" << op.argtakesderivs_all() << ") ";
2994         if (allconst)
2995             out << "  CONST";
2996         if (i == 0 || bblockid(i) != bblockid(i-1))
2997             out << "  BBLOCK-START";
2998         std::string filename = op.sourcefile().string();
2999         size_t slash = filename.find_last_of ("/");
3000         if (slash != std::string::npos)
3001             filename.erase (0, slash+1);
3002         if (filename.length())
3003             out << "  (" << filename << ":" << op.sourceline() << ")";
3004         out << "\n";
3005     }
3006     if (inst()->nconnections()) {
3007         out << "  connections upstream:\n";
3008         for (int i = 0, e = inst()->nconnections(); i < e; ++i) {
3009             const Connection &c (inst()->connection(i));
3010             out << "    " << c.dst.type.c_str() << ' '
3011                 << inst()->symbol(c.dst.param)->name();
3012             if (c.dst.arrayindex >= 0)
3013                 out << '[' << c.dst.arrayindex << ']';
3014             out << " upconnected from layer " << c.srclayer << ' ';
3015             const ShaderInstance *up = group()[c.srclayer];
3016             out << "(" << up->layername() << ") ";
3017             out << "    " << c.src.type.c_str() << ' '
3018                 << up->symbol(c.src.param)->name();
3019             if (c.src.arrayindex >= 0)
3020                 out << '[' << c.src.arrayindex << ']';
3021             out << "\n";
3022         }
3023     }
3024     return out;
3025 }
3026 
3027 
3028 
// Top-level driver for runtime optimization of the whole shader group.
// Phases: copy code from the master shaders -> (optionally) merge
// duplicate instances -> optimize each layer forward then backward ->
// merge again -> propagate derivative needs upstream -> post-optimize
// cleanup -> collapse unused symbols and nop ops -> inventory what the
// optimized group still needs (textures, closures, attributes, userdata,
// globals) -> record timing and statistics.
void
RuntimeOptimizer::run ()
{
    Timer rop_timer;
    int nlayers = (int) group().nlayers ();
    if (debug())
        shadingcontext()->infof("About to optimize shader group %s (%d layers):",
                           group().name(), nlayers);
    if (debug())
        std::cout << "About to optimize shader group " << group().name() << "\n";

    for (int layer = 0;  layer < nlayers;  ++layer) {
        set_inst (layer);
        // These need to happen before merge_instances
        inst()->copy_code_from_master (group());
        mark_outgoing_connections();
    }

    // Inventory the network and print pre-optimized debug info
    size_t old_nsyms = 0, old_nops = 0;
    for (int layer = 0;  layer < nlayers;  ++layer) {
        set_inst (layer);
        if (debug() /* && optimize() >= 1*/) {
            find_basic_blocks ();
            std::cout.flush ();
            std::cout << "Before optimizing layer " << layer << " \""
                      << inst()->layername() << "\" (ID " << inst()->id() << ") :\n";
            printinst (std::cout);
            std::cout << "\n--------------------------------\n" << std::endl;
        }
        old_nsyms += inst()->symbols().size();
        old_nops += inst()->ops().size();
    }

    // opt_merge_instances==1 requests an early merge of identical
    // instances (presumably other values defer it to the post-opt merges
    // below -- TODO confirm against the option's documentation).
    if (shadingsys().m_opt_merge_instances == 1)
        shadingsys().merge_instances (group());

    // Per-layer storage; filled in during per-instance optimization.
    m_params_holding_globals.resize (nlayers);

    // Inventory for error calls so that if lazyerror=0 we don't incorrectly
    // assume the layer is unused.
    check_for_error_calls(false);

    // Optimize each layer, from first to last
    for (int layer = 0;  layer < nlayers;  ++layer) {
        set_inst (layer);
        if (inst()->unused())
            continue;
        // N.B. we need to resolve isconnected() calls before the instance
        // is otherwise optimized, or else isconnected() may not reflect
        // the original connectivity after substitutions are made.
        resolve_isconnected ();
        optimize_instance ();
    }
    check_for_error_calls(false);  // re-check

    // Optimize each layer again, from last to first (because some
    // optimizations are only apparent when the subsequent shaders have
    // been simplified).
    for (int layer = nlayers-1;  layer >= 0;  --layer) {
        set_inst (layer);
        if (! inst()->unused())
            optimize_instance ();
    }

    // Try merging instances again, now that we've optimized
    shadingsys().merge_instances (group(), true);

    for (int layer = nlayers-1;  layer >= 0;  --layer) {
        set_inst (layer);
        if (inst()->unused())
            continue;
        find_basic_blocks ();
        track_variable_dependencies ();

        // For our parameters that require derivatives, mark their
        // upstream connections as also needing derivatives.
        for (auto&& c : inst()->m_connections) {
            if (inst()->symbol(c.dst.param)->has_derivs()) {
                Symbol *source = group()[c.srclayer]->symbol(c.src.param);
                if (source->typespec().elementtype().is_float_based())
                    source->has_derivs (true);
            }
        }
    }

    // Post-opt cleanup: add useparam, coalesce temporaries, etc.
    for (int layer = 0;  layer < nlayers;  ++layer) {
        set_inst (layer);
        post_optimize_instance ();
    }

    // Last chance to eliminate duplicate instances
    shadingsys().merge_instances (group(), true);

    // Last inventory of error() calls, issue warnings if needed.
    check_for_error_calls(true);

    // Get rid of nop instructions and unused symbols.
    size_t new_nsyms = 0, new_nops = 0, new_deriv_syms = 0;
    for (int layer = 0;  layer < nlayers;  ++layer) {
        set_inst (layer);
        if (inst()->unused())
            continue;  // no need to print or gather stats for unused layers
        if (optimize() >= 1) {
            collapse_syms ();
            collapse_ops ();
        }
        if (debug() && !inst()->unused()) {
            track_variable_lifetimes ();
            std::cout << "After optimizing layer " << layer << " \""
                      << inst()->layername() << "\" (ID " << inst()->id() << ") :\n";
            printinst (std::cout);
            std::cout << "\n--------------------------------\n" << std::endl;
        }
        new_nsyms += inst()->symbols().size();
        new_nops += inst()->ops().size();
    }

    // Reset the group-wide inventory before re-scanning all layers to
    // find what the optimized group still needs at runtime.
    m_unknown_textures_needed = false;
    m_unknown_closures_needed = false;
    m_unknown_attributes_needed = false;
    m_textures_needed.clear();
    m_closures_needed.clear();
    m_globals_read = 0;
    m_globals_write = 0;
    m_globals_needed.clear();
    m_userdata_needed.clear();
    m_attributes_needed.clear();
    bool does_nothing = true;
    for (int layer = 0;  layer < nlayers;  ++layer) {
        set_inst (layer);
        if (inst()->unused())
            continue;  // no need to print or gather stats for unused layers
        FOREACH_SYM (Symbol &s, inst()) {
            // set the layer numbers
            s.layer (layer);
            // Find interpolated parameters
            if ((s.symtype() == SymTypeParam || s.symtype() == SymTypeOutputParam)
                && ! s.lockgeom()) {
                UserDataNeeded udn (s.name(), layer, s.typespec().simpletype(),
                                    s.data(), s.has_derivs());
                std::set<UserDataNeeded>::iterator found;
                found = m_userdata_needed.find (udn);
                if (found == m_userdata_needed.end())
                    m_userdata_needed.insert (udn);
                else if (udn.derivs && ! found->derivs) {
                    // Already recorded without derivs; upgrade the entry
                    // to the derivative-needing version.
                    m_userdata_needed.erase (found);
                    m_userdata_needed.insert (udn);
                }
            }
            // Track which globals the group needs
            if (s.symtype() == SymTypeGlobal) {
                m_globals_needed.insert (s.name());
                int bit = int(ShadingSystem::globals_bit (s.name()));
                if (s.everread())
                    m_globals_read |= bit;
                if (s.everwritten())
                    m_globals_write |= bit;
            }
            if (s.has_derivs())
                ++new_deriv_syms;
        }
        for (auto&& op : inst()->ops()) {
            const OpDescriptor *opd = shadingsys().op_descriptor (op.opname());
            if (! opd)
                continue;
            // a non-unused layer with a nontrivial op does something
            if (op.opname() != Strings::end && op.opname() != Strings::useparam)
                does_nothing = false;
            // Useparam of a down-connected or renderer output does something
            if (op.opname() == Strings::useparam) {
                for (int i = 0, e = op.nargs(); i < e; ++i) {
                    Symbol *sym = opargsym (op, i);
                    if (sym->connected_down() || sym->renderer_output())
                        does_nothing = false;
                }
            }
            if (opd->flags & OpDescriptor::Tex) {
                // for all the texture ops, arg 1 is the texture name
                Symbol *sym = opargsym (op, 1);
                OSL_DASSERT (sym && sym->typespec().is_string());
                if (sym->is_constant()) {
                    ustring texname = *(ustring *)sym->data();
                    m_textures_needed.insert (texname);
                } else {
                    m_unknown_textures_needed = true;
                }
            }
            if (op.opname() == u_closure) {
                // It's either 'closure result weight name' or 'closure result name'
                Symbol *sym = opargsym (op, 1); // arg 1 is the closure name
                if (sym && !sym->typespec().is_string())
                    sym = opargsym (op, 2);
                OSL_DASSERT (sym && sym->typespec().is_string());
                if (sym->is_constant()) {
                    ustring closurename = *(ustring *)sym->data();
                    m_closures_needed.insert (closurename);
                } else {
                    m_unknown_closures_needed = true;
                }
            } else if (op.opname() == u_getattribute) {
                Symbol *sym1 = opargsym (op, 1);
                OSL_DASSERT (sym1 && sym1->typespec().is_string());
                if (sym1->is_constant()) {
                    if (op.nargs() == 3) {
                        // getattribute( attributename, result )
                        m_attributes_needed.insert( AttributeNeeded( *(ustring *)sym1->data() ) );
                    } else {
                        OSL_DASSERT (op.nargs() == 4 || op.nargs() == 5);
                        Symbol *sym2 = opargsym (op, 2);
                        if (sym2->typespec().is_string()) {
                            // getattribute( scopename, attributename, result ) or
                            // getattribute( scopename, attributename, arrayindex, result )
                            if (sym2->is_constant()) {
                                m_attributes_needed.insert( AttributeNeeded(
                                    *(ustring *)sym2->data(), *(ustring *)sym1->data()
                                ) );
                            } else {
                                m_unknown_attributes_needed = true;
                            }
                        } else {
                            // getattribute( attributename, arrayindex, result )
                            m_attributes_needed.insert( AttributeNeeded( *(ustring *)sym1->data() ) );
                        }
                    }
                } else { // sym1 not constant
                    m_unknown_attributes_needed = true;
                }
            }
        }
    }
    group().does_nothing (does_nothing);

    m_stat_specialization_time = rop_timer();
    {
        // adjust memory stats
        ShadingSystemImpl &ss (shadingsys());
        spin_lock lock (ss.m_stat_mutex);
        ss.m_stat_preopt_syms += old_nsyms;
        ss.m_stat_preopt_ops += old_nops;
        ss.m_stat_postopt_syms += new_nsyms;
        ss.m_stat_postopt_ops += new_nops;
        ss.m_stat_syms_with_derivs += new_deriv_syms;
        if (does_nothing)
            ss.m_stat_empty_groups += 1;
    }
    // Optional human-readable report of the optimization results.
    if (shadingsys().m_compile_report) {
        shadingcontext()->infof("Optimized shader group %s:", group().name());
        shadingcontext()->infof(" spec %1.2fs, New syms %llu/%llu (%5.1f%%), ops %llu/%llu (%5.1f%%)",
              m_stat_specialization_time, new_nsyms, old_nsyms,
              100.0*double((long long)new_nsyms-(long long)old_nsyms)/double(old_nsyms),
              new_nops, old_nops,
              100.0*double((long long)new_nops-(long long)old_nops)/double(old_nops));
        if (does_nothing)
            shadingcontext()->infof("Group does nothing");
        if (m_textures_needed.size()) {
            shadingcontext()->infof("Group needs textures:");
            for (auto&& f : m_textures_needed)
                shadingcontext()->infof("    %s", f);
            if (m_unknown_textures_needed)
                shadingcontext()->infof("    Also may construct texture names on the fly.");
        }
        if (m_userdata_needed.size()) {
            shadingcontext()->infof("Group potentially needs userdata:");
            for (auto&& f : m_userdata_needed)
                shadingcontext()->infof("    %s %s %s", f.name, f.type,
                                        f.derivs ? "(derivs)" : "");
        }
        if (m_attributes_needed.size()) {
            shadingcontext()->infof("Group needs attributes:");
            for (auto&& f : m_attributes_needed)
                shadingcontext()->infof("    %s %s", f.name, f.scope);
            if (m_unknown_attributes_needed)
                shadingcontext()->infof("    Also may construct attribute names on the fly.");
        }
    }
}
3307 
3308 
3309 
3310 bool
police(const Opcode & op,string_view msg,int type)3311 RuntimeOptimizer::police(const Opcode& op, string_view msg, int type)
3312 {
3313     if ((type & police_gpu_err) && shadingsys().m_gpu_opt_error) {
3314         shadingcontext()->errorf("Optimization error for GPUs:\n"
3315                                  "  group:  %s\n"
3316                                  "  layer:  %s\n"
3317                                  "  source: %s:%d\n"
3318                                  "  issue:  %s",
3319                                  group().name(), inst()->layername(),
3320                                  op.sourcefile(), op.sourceline(), msg);
3321         return true;
3322     } else if ((type & police_opt_warn) && shadingsys().m_opt_warnings) {
3323         shadingcontext()->warningf("Optimization warning:\n"
3324                                    "  group:  %s\n"
3325                                    "  layer:  %s\n"
3326                                    "  source: %s:%d\n"
3327                                    "  issue:  %s",
3328                                    group().name(), inst()->layername(),
3329                                    op.sourcefile(), op.sourceline(), msg);
3330     }
3331     return false;
3332 }
3333 
3334 
3335 
3336 bool
check_for_error_calls(bool warn)3337 RuntimeOptimizer::check_for_error_calls(bool warn)
3338 {
3339     // If the "lazyerror" option is set, there's nothing to do.
3340     if (shadingsys().m_lazyerror)
3341         return false;
3342 
3343     // Check all the layers (even ones we think are unused) for `error()`
3344     // calls that still remain after runtime optimization. If found, warn
3345     // and mark the layer as having error calls.
3346     bool err = false;
3347     int nlayers = (int) group().nlayers ();
3348     for (int layer = 0;  layer < nlayers;  ++layer) {
3349         set_inst (layer);
3350         inst()->has_error_op(false);
3351         for (auto&& op : inst()->ops()) {
3352             if (op.opname() == Strings::error) {
3353                 inst()->has_error_op(true);
3354                 if (warn)
3355                     err |= police (op, "error() call present in optimized shader.",
3356                                    police_opt_warn);
3357             }
3358         }
3359     }
3360     return err;
3361 }
3362 
3363 
3364 
3365 bool
police_failed_optimizations()3366 RuntimeOptimizer::police_failed_optimizations()
3367 {
3368     bool err = false;
3369     bool do_warn = shadingsys().m_opt_warnings;
3370     bool do_gpu_err = shadingsys().m_gpu_opt_error;
3371     if (!do_warn && !do_gpu_err)
3372         return false;  // no need for any of this expense
3373 
3374     int nlayers = (int) group().nlayers ();
3375     for (int layer = 0;  layer < nlayers;  ++layer) {
3376         set_inst (layer);
3377         if (inst()->unused())
3378             continue;  // no need to print or gather stats for unused layers
3379         for (auto&& op : inst()->ops()) {
3380             const OpDescriptor *opd = shadingsys().op_descriptor (op.opname());
3381             if (! opd)
3382                 continue;
3383             if (opd->flags & OpDescriptor::Tex) {
3384                 Symbol *sym = opargsym (op, 1);  // arg 1 is texture name
3385                 OSL_DASSERT(sym && sym->typespec().is_string());
3386                 if (! sym->is_constant()) {
3387                     err |= police (op, OIIO::Strutil::sprintf("%s(): texture name cannot be reduced to a constant.",
3388                                               op.opname()),
3389                                    police_gpu_err);
3390                 }
3391             }
3392             // FIXME: Will add more tests and warnings as we go
3393         }
3394     }
3395     return err;
3396 }
3397 
3398 }; // namespace pvt
3399 OSL_NAMESPACE_EXIT
3400