1 // @HEADER
2 // ***********************************************************************
3 //
4 //                    Teuchos: Common Tools Package
5 //                 Copyright (2004) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #include "Teuchos_TimeMonitor.hpp"
43 #include "Teuchos_CommHelpers.hpp"
44 #include "Teuchos_DefaultComm.hpp"
45 #include "Teuchos_TableColumn.hpp"
46 #include "Teuchos_TableFormat.hpp"
47 #include "Teuchos_StandardParameterEntryValidators.hpp"
48 #include "Teuchos_ScalarTraits.hpp"
49 #include "Teuchos_StackedTimer.hpp"
50 
51 #include <functional>
52 #include <iomanip>
53 #ifdef HAVE_TEUCHOS_ADD_TIME_MONITOR_TO_STACKED_TIMER
54 #include <sstream>
55 #endif
56 
57 namespace Teuchos {
58   /**
59    * \class MaxLoc
60    * \brief Teuchos version of MPI_MAXLOC.
61    * \author Mark Hoemmen
62    *
63    * \tparam Ordinal The template parameter of \c Comm.
64    * \tparam ScalarType Type for which to find the maximum.
65    * \tparam IndexType Type indicating the index of the maximum.
66    *
67    * \c MPI_MAXLOC is a standard reduction operator provided by the
68    * MPI standard.  According to the standard, \c MPI_MAXLOC combines
69    * the (value, index) pairs (u,i) and (v,j) into (w,j), where \f$w =
70    * max(u,v)\f$, and
71    * \f[
72    *   k = \begin{cases}
73    *     i         & \text{if $u > v$}, \\
74    *     \min(i,j) & \text{if $u = v$}, \\
75    *     j         & \text{if $u < v$}. \\
76    * \end{cases}
77    * \f]
78 
79    * This class implements the \c MPI_MAXLOC reduction operator for
80    * the Teuchos communication wrappers.  We need to define this
81    * separately from MPI for two reasons.  First, the Teuchos comm
82    * wrappers do not currently expose taking an MPI_Op.  Second, we
83    * need MaxLoc to be templated on the scalar and index types, in
84    * order to match Teuchos' MPI interface.
85    *
86    * What happens to NaN ("Not a Number")?  A NaN is neither less
87    * than, greater than, or equal to any floating-point number or any
88    * NaN.  We can alter the above definition slightly so that a \c
89    * MaxLoc reduction has a well-defined result in case the array
90    * contains a NaN:
91    * \f[
92    *   w = \begin{cases}
93    *     u     & \text{if $u > v$}, \\
94    *     v     & \text{if $u < v$}. \\
95    *     u     & \text{otherwise}. \\
96    *   \end{cases}
97    * \f]
98    * and
99    * \f[
100    *   k = \begin{cases}
101    *     i         & \text{if $u > v$}, \\
102    *     j         & \text{if $u < v$}. \\
103    *     \min(i,j) & \text{otherwise}. \\
104    *   \end{cases}
105    * \f]
106    * Defining \c MaxLoc in this way ensures that for any array
107    * containing a NaN, the value (w) returned is the first NaN, and
108    * the index (k) returned is the index of the first NaN.
109    */
110   template<class Ordinal, class ScalarType, class IndexType>
111   class MaxLoc :
112     public ValueTypeReductionOp<Ordinal, std::pair<ScalarType, IndexType> > {
113   public:
114     void
115     reduce (const Ordinal count,
116             const std::pair<ScalarType, IndexType> inBuffer[],
117             std::pair<ScalarType, IndexType> inoutBuffer[]) const;
118   };
119 
120   template<class Ordinal>
121   class MaxLoc<Ordinal, double, int> :
122     public ValueTypeReductionOp<Ordinal, std::pair<double, int> > {
123   public:
124     void
reduce(const Ordinal count,const std::pair<double,int> inBuffer[],std::pair<double,int> inoutBuffer[]) const125     reduce (const Ordinal count,
126             const std::pair<double, int> inBuffer[],
127             std::pair<double, int> inoutBuffer[]) const
128     {
129       for (Ordinal ind = 0; ind < count; ++ind) {
130         const std::pair<double, int>& in = inBuffer[ind];
131         std::pair<double, int>& inout = inoutBuffer[ind];
132 
133         if (in.first > inout.first) {
134           inout.first = in.first;
135           inout.second = in.second;
136         } else if (in.first < inout.first) {
137           // Don't need to do anything; inout has the values.
138         } else { // equal, or at least one is NaN.
139           inout.first = in.first;
140           inout.second = std::min (in.second, inout.second);
141         }
142       }
143     }
144   };
145 
146   /** \class MinLoc
147    * \brief Teuchos version of MPI_MINLOC.
148    * \author Mark Hoemmen
149    *
150    * \tparam Ordinal The template parameter of \c Comm.
151    * \tparam ScalarType Type for which to find the minimum.
152    * \tparam IndexType Type indicating the index of the minimum.
153    *
154    * \c MPI_MINLOC is a standard reduction operator provided by the
155    * MPI standard.  According to the standard, \c MPI_MINLOC combines
156    * the (value, index) pairs (u,i) and (v,j) into (w,j), where \f$w =
157    * min(u,v)\f$, and
158    * \f[
159    *   k = \begin{cases}
160    *     i         & \text{if $u < v$}, \\
161    *     \min(i,j) & \text{if $u = v$}, \\
162    *     j         & \text{if $u > v$}. \\
163    *   \end{cases}
164    * \f]
165    * This class implements the \c MPI_MINLOC reduction operator for
166    * the Teuchos communication wrappers.
167    *
168    * Refer to the note in the documentation of \c MaxLoc that
169    * explains how we adjust the above definition to produce
170    * well-defined results even if the array contains a NaN.
171    */
172   template<class Ordinal, class ScalarType, class IndexType>
173   class MinLoc :
174     public ValueTypeReductionOp<Ordinal, std::pair<ScalarType, IndexType> > {
175   public:
176     void
177     reduce (const Ordinal count,
178             const std::pair<ScalarType, IndexType> inBuffer[],
179             std::pair<ScalarType, IndexType> inoutBuffer[]) const;
180   };
181 
182   template<class Ordinal>
183   class MinLoc<Ordinal, double, int> :
184     public ValueTypeReductionOp<Ordinal, std::pair<double, int> > {
185   public:
186     void
reduce(const Ordinal count,const std::pair<double,int> inBuffer[],std::pair<double,int> inoutBuffer[]) const187     reduce (const Ordinal count,
188             const std::pair<double, int> inBuffer[],
189             std::pair<double, int> inoutBuffer[]) const
190     {
191       for (Ordinal ind = 0; ind < count; ++ind) {
192         const std::pair<double, int>& in = inBuffer[ind];
193         std::pair<double, int>& inout = inoutBuffer[ind];
194 
195         if (in.first < inout.first) {
196           inout.first = in.first;
197           inout.second = in.second;
198         } else if (in.first > inout.first) {
199           // Don't need to do anything; inout has the values.
200         } else { // equal, or at least one is NaN.
201           inout.first = in.first;
202           inout.second = std::min (in.second, inout.second);
203         }
204       }
205     }
206   };
207 
208   /** \class MinLocNonzero
209    * \brief same as MinLoc, but don't allow zero
210   */
211   template<class Ordinal, class ScalarType, class IndexType>
212   class MinLocNonzero :
213     public ValueTypeReductionOp<Ordinal, std::pair<ScalarType, IndexType> > {
214   public:
215     void
216     reduce (const Ordinal count,
217             const std::pair<ScalarType, IndexType> inBuffer[],
218             std::pair<ScalarType, IndexType> inoutBuffer[]) const;
219   };
220 
221   template<class Ordinal>
222   class MinLocNonzero<Ordinal, double, int> :
223     public ValueTypeReductionOp<Ordinal, std::pair<double, int> > {
224   public:
225     void
reduce(const Ordinal count,const std::pair<double,int> inBuffer[],std::pair<double,int> inoutBuffer[]) const226     reduce (const Ordinal count,
227             const std::pair<double, int> inBuffer[],
228             std::pair<double, int> inoutBuffer[]) const
229     {
230       for (Ordinal ind = 0; ind < count; ++ind) {
231         const std::pair<double, int>& in = inBuffer[ind];
232         std::pair<double, int>& inout = inoutBuffer[ind];
233 
234         if ( (in.first < inout.first && in.first != 0) || (inout.first == 0 && in.first != 0) ) {
235           inout.first = in.first;
236           inout.second = in.second;
237         } else if (in.first > inout.first) {
238           // Don't need to do anything; inout has the values.
239         } else { // equal, or at least one is NaN.
240           inout.first = in.first;
241           inout.second = std::min (in.second, inout.second);
242         }
243       }
244     }
245   };
246 
247   // Typedef used internally by TimeMonitor::summarize() and its
248   // helper functions.  The map is keyed on timer label (a string).
249   // Each value is a pair: (total number of seconds over all calls to
250   // that timer, total number of calls to that timer).
251   typedef std::map<std::string, std::pair<double, int> > timer_map_t;
252 
253   // static initialization
254   Teuchos::RCP<Teuchos::StackedTimer> TimeMonitor::stackedTimer_ = Teuchos::rcp(new Teuchos::StackedTimer("Teuchos::StackedTimer"));
255 
TimeMonitor(Time & timer,bool reset)256   TimeMonitor::TimeMonitor (Time& timer, bool reset)
257     : PerformanceMonitorBase<Time>(timer, reset)
258   {
259     if (!isRecursiveCall()) {
260       counter().start(reset);
261 #ifdef HAVE_TEUCHOS_ADD_TIME_MONITOR_TO_STACKED_TIMER
262       if (nonnull(stackedTimer_))
263         stackedTimer_->start(counter().name(),false);
264 #endif
265     }
266   }
267 
~TimeMonitor()268   TimeMonitor::~TimeMonitor() {
269     if (!isRecursiveCall()) {
270       counter().stop();
271 #ifdef HAVE_TEUCHOS_ADD_TIME_MONITOR_TO_STACKED_TIMER
272       try {
273         if (nonnull(stackedTimer_))
274           stackedTimer_->stop(counter().name(),false);
275       }
276       catch (std::runtime_error&) {
277         std::ostringstream warning;
278         warning <<
279           "\n*********************************************************************\n"
280             "WARNING: Overlapping timers detected! Near: " <<counter().name()<<"\n"
281           "A TimeMonitor timer was stopped before a nested subtimer was\n"
282           "stopped. This is not allowed by the StackedTimer. This corner case\n"
283           "typically occurs if the TimeMonitor is stored in an RCP and the RCP is\n"
284           "assigned to a new timer. To disable this warning, either fix the\n"
285           "ordering of timer creation and destuction or disable the StackedTimer\n"
286           "support in the TimeMonitor by setting the StackedTimer to null\n"
287           "Example:\n"
288           " RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(\"Junk\"))));\n"
289           "///code to time \n"
290           "MM = Teuchos::null;\n"
291           "MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(\"SecondJunk\"))));\n"
292           "*********************************************************************\n";
293         std::cout << warning.str() << std::endl;
294         Teuchos::TimeMonitor::setStackedTimer(Teuchos::null);
295       }
296 #endif
297     }
298   }
299 
300   void
disableTimer(const std::string & name)301   TimeMonitor::disableTimer (const std::string& name)
302   {
303     RCP<Time> timer = lookupCounter (name);
304     TEUCHOS_TEST_FOR_EXCEPTION(
305       timer == null, std::invalid_argument,
306       "TimeMonitor::disableTimer: Invalid timer \"" << name << "\"");
307     timer->disable ();
308   }
309 
310   void
enableTimer(const std::string & name)311   TimeMonitor::enableTimer (const std::string& name)
312   {
313     RCP<Time> timer = lookupCounter (name);
314     TEUCHOS_TEST_FOR_EXCEPTION(
315       timer == null, std::invalid_argument,
316       "TimeMonitor::enableTimer: Invalid timer \"" << name << "\"");
317     timer->enable ();
318   }
319 
320   void
zeroOutTimers()321   TimeMonitor::zeroOutTimers()
322   {
323     typedef std::map<std::string, RCP<Time> > map_type;
324     typedef map_type::iterator iter_type;
325     map_type& ctrs = counters ();
326 
327     // In debug mode, loop first to check whether any of the timers
328     // are running, before resetting them.  This ensures that this
329     // method satisfies the strong exception guarantee (either it
330     // completes normally, or there are no side effects).
331 #ifdef TEUCHOS_DEBUG
332     for (iter_type it = ctrs.begin(); it != ctrs.end(); ++it) {
333       // We throw a runtime_error rather than a logic_error, because
334       // logic_error suggests a bug in the implementation of
335       // TimeMonitor.  Calling zeroOutTimers() when a timer is running
336       // is not TimeMonitor's fault.
337       TEUCHOS_TEST_FOR_EXCEPTION(
338         it->second->isRunning (), std::runtime_error,
339         "Timer \"" << it->second->name () << "\" is currently running.  "
340         "You are not allowed to reset running timers.");
341     }
342 #endif // TEUCHOS_DEBUG
343 
344     for (iter_type it = ctrs.begin(); it != ctrs.end(); ++it) {
345       it->second->reset ();
346     }
347   }
348 
349   // An anonymous namespace is the standard way of limiting linkage of
350   // its contained routines to file scope.
351   namespace {
352     // \brief Return an "empty" local timer datum.
353     //
354     // "Empty" means the datum has zero elapsed time and zero call
355     // count.  This function does not actually create a timer.
356     //
357     // \param name The timer's name.
358     std::pair<std::string, std::pair<double, int> >
makeEmptyTimerDatum(const std::string & name)359     makeEmptyTimerDatum (const std::string& name)
360     {
361       return std::make_pair (name, std::make_pair (double(0), int(0)));
362     }
363 
364     // \fn collectLocalTimerData
365     // \brief Collect and sort local timer data by timer names.
366     //
367     // \param localData [out] Map whose keys are the timer names, and
368     //   whose value for each key is the total elapsed time (in
369     //   seconds) and the call count for the timer with that name.
370     //
371     // \param localCounters [in] Timers from which to extract data.
372     //
373     // \param filter [in] Filter for timer labels.  If filter is not
374     //   empty, this method will only collect data for local timers
375     //   whose labels begin with this string.
376     //
377     // Extract the total elapsed time and call count from each timer
378     // in the given array.  Merge results for timers with duplicate
379     // labels, by summing their total elapsed times and call counts
380     // pairwise.
381     void
collectLocalTimerData(timer_map_t & localData,const std::map<std::string,RCP<Time>> & localCounters,const std::string & filter="")382     collectLocalTimerData (timer_map_t& localData,
383                            const std::map<std::string, RCP<Time> >& localCounters,
384                            const std::string& filter="")
385     {
386       using std::make_pair;
387       typedef timer_map_t::iterator iter_t;
388 
389       timer_map_t theLocalData;
390       for (std::map<std::string, RCP<Time> >::const_iterator it = localCounters.begin();
391            it != localCounters.end(); ++it) {
392         const std::string& name = it->second->name ();
393 
394         // Filter current timer name, if provided filter is nonempty.
395         // Filter string must _start_ the timer label, not just be in it.
396         const bool skipThisOne = (filter != "" && name.find (filter) != 0);
397         if (! skipThisOne) {
398           const double timing = it->second->totalElapsedTime ();
399           const int numCalls = it->second->numCalls ();
400 
401           // Merge timers with duplicate labels, by summing their
402           // total elapsed times and call counts.
403           iter_t loc = theLocalData.find (name);
404           if (loc == theLocalData.end()) {
405             // Use loc as an insertion location hint.
406             theLocalData.insert (loc, make_pair (name, make_pair (timing, numCalls)));
407           }
408           else {
409             loc->second.first += timing;
410             loc->second.second += numCalls;
411           }
412         }
413       }
414       // This avoids copying the map, and also makes this method
415       // satisfy the strong exception guarantee.
416       localData.swap (theLocalData);
417     }
418 
419     // \brief Locally filter out timer data with zero call counts.
420     //
421     // \param timerData [in/out]
422     void
filterZeroData(timer_map_t & timerData)423     filterZeroData (timer_map_t& timerData)
424     {
425       // FIXME (mfh 15 Mar 2013) Should use std::map::erase with
426       // iterator hint, instead of rebuilding the map completely.
427       timer_map_t newTimerData;
428       for (timer_map_t::const_iterator it = timerData.begin();
429            it != timerData.end(); ++it) {
430         if (it->second.second > 0) {
431           newTimerData[it->first] = it->second;
432         }
433       }
434       timerData.swap (newTimerData);
435     }
436 
437     /// \fn collectLocalTimerDataAndNames
438     /// \brief Collect the local timer data and timer names from the timers.
439     ///
440     /// \param localTimerData [out] Timer data extracted from
441     ///   localTimers.  Contents on input are ignored and overwritten.
442     ///   See the documentation of \c collectLocalTimerData().
443     ///
444     /// \param localTimerNames [out] On output: names of timers
445     ///   extracted from \c localTimers, in the same order as the keys
446     ///   of \c localTimerData.  Resized as necessary.  Contents on
447     ///   input are ignored and overwritten.
448     ///
449     /// \param localTimers [in] (Local) timers from which to extract data.
450     ///
451     /// \param writeZeroTimers [in] If true, do not include timers
452     ///   with zero call counts in the \c localTimerData and \c
453     ///   localTimerNames output.
454     ///
455     /// \param filter [in] Filter for timer labels.  If filter is not
456     ///   empty, this method will only collect data for local timers
457     ///   whose labels begin with this string.
458     void
collectLocalTimerDataAndNames(timer_map_t & localTimerData,Array<std::string> & localTimerNames,const std::map<std::string,RCP<Time>> & localTimers,const bool writeZeroTimers,const std::string & filter="")459     collectLocalTimerDataAndNames (timer_map_t& localTimerData,
460                                    Array<std::string>& localTimerNames,
461                                    const std::map<std::string, RCP<Time> >& localTimers,
462                                    const bool writeZeroTimers,
463                                    const std::string& filter="")
464     {
465       // Collect and sort local timer data by timer names.
466       collectLocalTimerData (localTimerData, localTimers, filter);
467 
468       // Filter out zero data locally first.  This ensures that if we
469       // are writing global stats, and if a timer name exists in the
470       // set of global names, then that timer has a nonzero call count
471       // on at least one MPI process.
472       if (! writeZeroTimers) {
473         filterZeroData (localTimerData);
474       }
475 
476       // Extract the set of local timer names.  The std::map keeps
477       // them sorted alphabetically.
478       localTimerNames.reserve (localTimerData.size());
479       for (timer_map_t::const_iterator it = localTimerData.begin();
480            it != localTimerData.end(); ++it) {
481         localTimerNames.push_back (it->first);
482       }
483     }
484 
485     /// \brief Merge local timer data into global data.
486     ///
487     /// Call this method in \c summarize() only if the \c
488     /// writeGlobalStats argument is true.
489     ///
490     /// \param globalTimerData [out] Result of merging localTimerData
491     ///   over the processes in the given communicator.
492     ///
493     /// \param globalTimerNames [out] Names of the timers in
494     ///   globalTimerData; same as the keys of the map.
495     ///
496     /// \param localTimerData [in/out] On input: the first return
497     ///   value of \c collectLocalTimerDataAndNames().  On output, if
498     ///   writeZeroTimers is true, data for timers with zero call
499     ///   counts will be removed.
500     ///
501     /// \param localTimerNames [in/out] On input: the second return
502     ///   value of \c collectLocalTimerDataAndNames().  On output, if
503     ///   writeZeroTimers is true, names of timers with zero call
504     ///   counts will be removed.
505     ///
506     /// \param comm [in] Communicator over which to merge.
507     ///
508     /// \param alwaysWriteLocal [in] If true, and if the local set of
509     ///   timers differs from the global set of timers (either the
510     ///   union or the intersection, depending on \c setOp), Proc 0
511     ///   will create corresponding timer data in localTimerData with
512     ///   zero elapsed times and call counts.  (This pads the output
513     ///   so it fits in a tabular form.)
514     ///
515     /// \param setOp [in] If \c Intersection, compute the intersection
516     ///   of all created timers over all processes in the
517     ///   communicator.  If \c Union, compute the union of all created
518     ///   timers over all processes in the communicator.
519     void
collectGlobalTimerData(timer_map_t & globalTimerData,Array<std::string> & globalTimerNames,timer_map_t & localTimerData,Array<std::string> & localTimerNames,Ptr<const Comm<int>> comm,const bool alwaysWriteLocal,const ECounterSetOp setOp)520     collectGlobalTimerData (timer_map_t& globalTimerData,
521                             Array<std::string>& globalTimerNames,
522                             timer_map_t& localTimerData,
523                             Array<std::string>& localTimerNames,
524                             Ptr<const Comm<int> > comm,
525                             const bool alwaysWriteLocal,
526                             const ECounterSetOp setOp)
527     {
528       // There may be some global timers that are not local timers on
529       // the calling MPI process(es).  In that case, if
530       // alwaysWriteLocal is true, then we need to fill in the
531       // "missing" local timers.  That will ensure that both global
532       // and local timer columns in the output table have the same
533       // number of rows.  The collectLocalTimerDataAndNames() method
534       // may have already filtered out local timers with zero call
535       // counts (if its writeZeroTimers argument was false), but we
536       // won't be filtering again.  Thus, any local timer data we
537       // insert here won't get filtered out.
538       //
539       // Note that calling summarize() with writeZeroTimers == false
540       // will still do what it says, even if we insert local timers
541       // with zero call counts here.
542 
543       // This does the correct and inexpensive thing (just copies the
544       // timer data) if numProcs == 1.  Otherwise, it initiates a
545       // communication with \f$O(\log P)\f$ messages along the
546       // critical path, where \f$P\f$ is the number of participating
547       // processes.
548       mergeCounterNames (*comm, localTimerNames, globalTimerNames, setOp);
549 
550 #ifdef TEUCHOS_DEBUG
551       {
552         // Sanity check that all processes have the name number of
553         // global timer names.
554         const timer_map_t::size_type myNumGlobalNames = globalTimerNames.size();
555         timer_map_t::size_type minNumGlobalNames = 0;
556         timer_map_t::size_type maxNumGlobalNames = 0;
557         reduceAll (*comm, REDUCE_MIN, myNumGlobalNames,
558                    outArg (minNumGlobalNames));
559         reduceAll (*comm, REDUCE_MAX, myNumGlobalNames,
560                    outArg (maxNumGlobalNames));
561         TEUCHOS_TEST_FOR_EXCEPTION(minNumGlobalNames != maxNumGlobalNames,
562           std::logic_error, "Min # global timer names = " << minNumGlobalNames
563           << " != max # global timer names = " << maxNumGlobalNames
564           << ".  Please report this bug to the Teuchos developers.");
565         TEUCHOS_TEST_FOR_EXCEPTION(myNumGlobalNames != minNumGlobalNames,
566           std::logic_error, "My # global timer names = " << myNumGlobalNames
567           << " != min # global timer names = " << minNumGlobalNames
568           << ".  Please report this bug to the Teuchos developers.");
569       }
570 #endif // TEUCHOS_DEBUG
571 
572       // mergeCounterNames() just merges the counters' names, not
573       // their actual data.  Now we need to fill globalTimerData with
574       // this process' timer data for the timers in globalTimerNames.
575       //
576       // All processes need the full list of global timers, since
577       // there may be some global timers that are not local timers.
578       // That's why mergeCounterNames() has to be an all-reduce, not
579       // just a reduction to Proc 0.
580       //
581       // Insertion optimization: if the iterator given to map::insert
582       // points right before where we want to insert, insertion is
583       // O(1).  globalTimerNames is sorted, so feeding the iterator
584       // output of map::insert into the next invocation's input should
585       // make the whole insertion O(N) where N is the number of
586       // entries in globalTimerNames.
587       timer_map_t::iterator globalMapIter = globalTimerData.begin();
588       timer_map_t::iterator localMapIter;
589       for (Array<string>::const_iterator it = globalTimerNames.begin();
590            it != globalTimerNames.end(); ++it) {
591         const std::string& globalName = *it;
592         localMapIter = localTimerData.find (globalName);
593 
594         if (localMapIter == localTimerData.end()) {
595           if (alwaysWriteLocal) {
596             // If there are some global timers that are not local
597             // timers, and if we want to print local timers, we insert
598             // a local timer datum with zero elapsed time and zero
599             // call count into localTimerData as well.  This will
600             // ensure that both global and local timer columns in the
601             // output table have the same number of rows.
602             //
603             // We really only need to do this on Proc 0, which is the
604             // only process that currently may print local timers.
605             // However, we do it on all processes, just in case
606             // someone later wants to modify this function to print
607             // out local timer data for some process other than Proc
608             // 0.  This extra computation won't affect the cost along
609             // the critical path, for future computations in which
610             // Proc 0 participates.
611             localMapIter = localTimerData.insert (localMapIter, makeEmptyTimerDatum (globalName));
612 
613             // Make sure the missing global name gets added to the
614             // list of local names.  We'll re-sort the list of local
615             // names below.
616             localTimerNames.push_back (globalName);
617           }
618           // There's a global timer that's not a local timer.  Add it
619           // to our pre-merge version of the global timer data so that
620           // we can safely merge the global timer data later.
621           globalMapIter = globalTimerData.insert (globalMapIter, makeEmptyTimerDatum (globalName));
622         }
623         else {
624           // We have this global timer name in our local timer list.
625           // Fill in our pre-merge version of the global timer data
626           // with our local data.
627           globalMapIter = globalTimerData.insert (globalMapIter, std::make_pair (globalName, localMapIter->second));
628         }
629       }
630 
631       if (alwaysWriteLocal) {
632         // Re-sort the list of local timer names, since we may have
633         // inserted "missing" names above.
634         std::sort (localTimerNames.begin(), localTimerNames.end());
635       }
636 
637 #ifdef TEUCHOS_DEBUG
638       {
639         // Sanity check that all processes have the name number of
640         // global timers.
641         const timer_map_t::size_type myNumGlobalTimers = globalTimerData.size();
642         timer_map_t::size_type minNumGlobalTimers = 0;
643         timer_map_t::size_type maxNumGlobalTimers = 0;
644         reduceAll (*comm, REDUCE_MIN, myNumGlobalTimers,
645                    outArg (minNumGlobalTimers));
646         reduceAll (*comm, REDUCE_MAX, myNumGlobalTimers,
647                    outArg (maxNumGlobalTimers));
648         TEUCHOS_TEST_FOR_EXCEPTION(minNumGlobalTimers != maxNumGlobalTimers,
649                                    std::logic_error, "Min # global timers = " << minNumGlobalTimers
650                                    << " != max # global timers = " << maxNumGlobalTimers
651                                    << ".  Please report this bug to the Teuchos developers.");
652         TEUCHOS_TEST_FOR_EXCEPTION(myNumGlobalTimers != minNumGlobalTimers,
653                                    std::logic_error, "My # global timers = " << myNumGlobalTimers
654                                    << " != min # global timers = " << minNumGlobalTimers
655                                    << ".  Please report this bug to the Teuchos developers.");
656       }
657 #endif // TEUCHOS_DEBUG
658     }
659 
660     /// \brief Compute global timer statistics.
661     ///
662     /// Currently, this function computes the "MinOverProcs",
663     /// "MeanOverProcs", "MaxOverProcs", and "MeanOverCallCounts"
664     /// timings for each timer.  Along with the min / max timing comes
665     /// the call count of the process who had the min / max.  (If more
666     /// than one process had the min / max, then the call count on the
667     /// process with the smallest rank is reported.)
668     ///
669     /// "MeanOverProcs" is the arithmetic mean of the cumulative
670     /// timings over processes.  It ignores the call counts.  It
671     /// includes the mean call count, which may be fractional, and has
672     /// no particular connection to the mean timing.
673     ///
674     /// The "MeanOverCallCounts" is an arithmetic mean of timings that
675     /// accounts for call counts.  Each timing is the sum over all
676     /// calls.  Thus, this mean equals the sum of the timing over all
677     /// processes, divided by the sum of the call counts over all
678     /// processes for that timing.  (We compute it a bit differently
679     /// to help prevent overflow.)  Along with the mean timing comes
680     /// the same mean call count as mentioned above.
681     ///
682     /// Statistics may optionally be calculated in a mode that ignores
683     /// contributions from processes that either do not have a timer
684     /// or have a hard zero for that time.  This is slightly more expensive
685     /// than the default mode, as it requires one additional all-reduce.
686     ///
687     /// \param[out] statData On output: Global timer statistics.  See
688     ///   the \c stat_map_type typedef documentation for an explanation
689     ///   of the data structure.
690     ///
691     /// \param[out] statNames On output: Each value in the statData
692     ///   map is a vector.  That vector v has the same number of
693     ///   entries as statNames.  statNames[k] is the name of the
694     ///   statistic (e.g., "Min", "MeanOverProcs", "Max", or
695     ///   "MeanOverCallCounts") stored as v[k].
696     ///
697     /// \param[in] comm Communicator over which to compute statistics.
698     ///
699     /// \param[in] globalTimerData Output with the same name of the
700     ///   \c collectGlobalTimerData() function.  That function assures
701     ///   that all processes have the same keys stored in this map.
702     ///
703     /// \param[in] ignoreZeroTimers [in] Processes that either do not have
704     ///   a particular timer or have zero time for a timer are not used
705     ///   in calculating global statistics.
706     void
computeGlobalTimerStats(stat_map_type & statData,std::vector<std::string> & statNames,Ptr<const Comm<int>> comm,const timer_map_t & globalTimerData,const bool ignoreZeroTimers)707     computeGlobalTimerStats (stat_map_type& statData,
708                              std::vector<std::string>& statNames,
709                              Ptr<const Comm<int> > comm,
710                              const timer_map_t& globalTimerData,
711                              const bool ignoreZeroTimers)
712     {
713       using Teuchos::ScalarTraits;
714 
715       const int numTimers = static_cast<int> (globalTimerData.size());
716       const int numProcs = comm->getSize();
717 
718       // Extract pre-reduction timings and call counts into a
719       // sequential array.  This array will be in the same order as
720       // the global timer names are in the map.
721       Array<std::pair<double, int> > timingsAndCallCounts;
722       timingsAndCallCounts.reserve (numTimers);
723       for (timer_map_t::const_iterator it = globalTimerData.begin();
724            it != globalTimerData.end(); ++it) {
725         timingsAndCallCounts.push_back (it->second);
726       }
727 
728       // For each timer name, compute the min timing and its
729       // corresponding call count.  If two processes have the same
730       // timing but different call counts, the minimum call count will
731       // be used.
732       Array<std::pair<double, int> > minTimingsAndCallCounts (numTimers);
733       if (numTimers > 0) {
734         if (ignoreZeroTimers)
735           reduceAll (*comm, MinLocNonzero<int, double, int>(), numTimers,
736                      &timingsAndCallCounts[0], &minTimingsAndCallCounts[0]);
737         else
738           reduceAll (*comm, MinLoc<int, double, int>(), numTimers,
739                      &timingsAndCallCounts[0], &minTimingsAndCallCounts[0]);
740       }
741 
742       // For each timer name, compute the max timing and its
743       // corresponding call count.  If two processes have the same
744       // timing but different call counts, the minimum call count will
745       // be used.
746       Array<std::pair<double, int> > maxTimingsAndCallCounts (numTimers);
747       if (numTimers > 0) {
748         reduceAll (*comm, MaxLoc<int, double, int>(), numTimers,
749                    &timingsAndCallCounts[0], &maxTimingsAndCallCounts[0]);
750       }
751 
752       // For each timer name, compute the mean-over-processes timing,
753       // the mean call count, and the mean-over-call-counts timing.
754       // The mean call count is reported as a double to allow a
755       // fractional value.
756       //
757       // Each local timing is really the total timing over all local
758       // invocations.  The number of local invocations is the call
759       // count.  Thus, the mean-over-call-counts timing is the sum of
760       // all the timings (over all processes), divided by the sum of
761       // all the call counts (over all processes).  We compute it in a
762       // different way to over unnecessary overflow.
763       Array<double> meanOverCallCountsTimings (numTimers);
764       Array<double> meanOverProcsTimings (numTimers);
765       Array<double> meanCallCounts (numTimers);
766       Array<int>    ICallThisTimer (numTimers);
767       Array<int>    numProcsCallingEachTimer (numTimers);
768       {
769         // Figure out how many processors actually call each timer.
770         if (ignoreZeroTimers) {
771           for (int k = 0; k < numTimers; ++k) {
772             const double callCount = static_cast<double> (timingsAndCallCounts[k].second);
773             if (callCount > 0) ICallThisTimer[k] = 1;
774             else               ICallThisTimer[k] = 0;
775           }
776           if (numTimers > 0) {
777             reduceAll (*comm, REDUCE_SUM, numTimers, &ICallThisTimer[0],
778                        &numProcsCallingEachTimer[0]);
779           }
780         }
781 
782         // When summing, first scale by the number of processes.  This
783         // avoids unnecessary overflow, and also gives us the mean
784         // call count automatically.
785         Array<double> scaledTimings (numTimers);
786         Array<double> scaledCallCounts (numTimers);
787         const double P = static_cast<double> (numProcs);
788 
789         if (ignoreZeroTimers) {
790           for (int k = 0; k < numTimers; ++k) {
791             const double timing = timingsAndCallCounts[k].first;
792             const double callCount = static_cast<double> (timingsAndCallCounts[k].second);
793 
794             scaledTimings[k] = timing / numProcsCallingEachTimer[k];
795             scaledCallCounts[k] = callCount / numProcsCallingEachTimer[k];
796           }
797         }
798         else {
799           for (int k = 0; k < numTimers; ++k) {
800             const double timing = timingsAndCallCounts[k].first;
801             const double callCount = static_cast<double> (timingsAndCallCounts[k].second);
802 
803             scaledTimings[k] = timing / P;
804             scaledCallCounts[k] = callCount / P;
805           }
806         }
807 
808         if (numTimers > 0) {
809           reduceAll (*comm, REDUCE_SUM, numTimers, &scaledTimings[0],
810                      &meanOverProcsTimings[0]);
811           reduceAll (*comm, REDUCE_SUM, numTimers, &scaledCallCounts[0],
812                      &meanCallCounts[0]);
813         }
814         // We don't have to undo the scaling for the mean timings;
815         // just divide by the scaled call count.
816         for (int k = 0; k < numTimers; ++k) {
817           if (meanCallCounts[k] > ScalarTraits<double>::zero ()) {
818             meanOverCallCountsTimings[k] = meanOverProcsTimings[k] / meanCallCounts[k];
819           }
820           else {
821             meanOverCallCountsTimings[k] = ScalarTraits<double>::zero ();
822           }
823         }
824       }
825 
826       // Reformat the data into the map of statistics.  Be sure that
827       // each value (the std::vector of (timing, call count) pairs,
828       // each entry of which is a different statistic) preserves the
829       // order of statNames.
830       statNames.resize (4);
831       statNames[0] = "MinOverProcs";
832       statNames[1] = "MeanOverProcs";
833       statNames[2] = "MaxOverProcs";
834       statNames[3] = "MeanOverCallCounts";
835 
836       stat_map_type::iterator statIter = statData.end();
837       timer_map_t::const_iterator it = globalTimerData.begin();
838       for (int k = 0; it != globalTimerData.end(); ++k, ++it) {
839         std::vector<std::pair<double, double> > curData (4);
840         curData[0] = minTimingsAndCallCounts[k];
841         curData[1] = std::make_pair (meanOverProcsTimings[k], meanCallCounts[k]);
842         curData[2] = maxTimingsAndCallCounts[k];
843         curData[3] = std::make_pair (meanOverCallCountsTimings[k], meanCallCounts[k]);
844 
845         // statIter gives an insertion location hint that makes each
846         // insertion O(1), since we remember the location of the last
847         // insertion.
848         statIter = statData.insert (statIter, std::make_pair (it->first, curData));
849       }
850     }
851 
852 
853     /// \brief Get a default communicator appropriate for the environment.
854     ///
855     /// If Trilinos was configured with MPI support, and if MPI has
856     /// been initialized, return a wrapped MPI_COMM_WORLD.  If
857     /// Trilinos was configured with MPI support, and if MPI has not
858     /// yet been initialized, return a serial communicator (containing
859     /// one process).  If Trilinos was <i>not</i> configured with MPI
860     /// support, return a serial communicator.
861     ///
862     /// Rationale: Callers may or may not have initialized MPI before
863     /// calling this method.  Just because they built with MPI,
864     /// doesn't mean they want to use MPI.  It's not my responsibility
865     /// to initialize MPI for them, and I don't have the context I
866     /// need in order to do so anyway.  Thus, if Trilinos was built
867     /// with MPI and MPI has not yet been initialized, this method
868     /// returns a "serial" communicator.
869     RCP<const Comm<int> >
getDefaultComm()870     getDefaultComm ()
871     {
872       // The default communicator.  If Trilinos was built with MPI
873       // enabled, this should be MPI_COMM_WORLD.  (If MPI has not yet
874       // been initialized, it's not valid to use the communicator!)
875       // Otherwise, this should be a "serial" (no MPI, one "process")
876       // communicator.
877       RCP<const Comm<int> > comm = DefaultComm<int>::getComm ();
878 
879 #ifdef HAVE_MPI
880       {
881         int mpiHasBeenStarted = 0;
882         MPI_Initialized (&mpiHasBeenStarted);
883         if (! mpiHasBeenStarted) {
884           // Make pComm a new "serial communicator."
885           comm = rcp_implicit_cast<const Comm<int> > (rcp (new SerialComm<int> ()));
886         }
887       }
888 #endif // HAVE_MPI
889       return comm;
890     }
891 
892   } // namespace (anonymous)
893 
894 
895   void
computeGlobalTimerStatistics(stat_map_type & statData,std::vector<std::string> & statNames,Ptr<const Comm<int>> comm,const ECounterSetOp setOp,const std::string & filter)896   TimeMonitor::computeGlobalTimerStatistics (stat_map_type& statData,
897                                              std::vector<std::string>& statNames,
898                                              Ptr<const Comm<int> > comm,
899                                              const ECounterSetOp setOp,
900                                              const std::string& filter)
901   {
902     // Collect local timer data and names.  Filter out timers with
903     // zero call counts if writeZeroTimers is false.  Also, apply the
904     // timer label filter at this point, so we don't have to compute
905     // statistics on timers we don't want to display anyway.
906     timer_map_t localTimerData;
907     Array<std::string> localTimerNames;
908     const bool writeZeroTimers = false;
909     collectLocalTimerDataAndNames (localTimerData, localTimerNames,
910                                    counters(), writeZeroTimers, filter);
911     // Merge the local timer data and names into global timer data and
912     // names.
913     timer_map_t globalTimerData;
914     Array<std::string> globalTimerNames;
915     const bool alwaysWriteLocal = false;
916     collectGlobalTimerData (globalTimerData, globalTimerNames,
917                             localTimerData, localTimerNames,
918                             comm, alwaysWriteLocal, setOp);
919     // Compute statistics on the data.
920     computeGlobalTimerStats (statData, statNames, comm, globalTimerData, false);
921   }
922 
923 
924   void
summarize(Ptr<const Comm<int>> comm,std::ostream & out,const bool alwaysWriteLocal,const bool writeGlobalStats,const bool writeZeroTimers,const ECounterSetOp setOp,const std::string & filter,const bool ignoreZeroTimers)925   TimeMonitor::summarize (Ptr<const Comm<int> > comm,
926                           std::ostream& out,
927                           const bool alwaysWriteLocal,
928                           const bool writeGlobalStats,
929                           const bool writeZeroTimers,
930                           const ECounterSetOp setOp,
931                           const std::string& filter,
932                           const bool ignoreZeroTimers)
933   {
934     //
935     // We can't just call computeGlobalTimerStatistics(), since
936     // summarize() has different options that affect whether global
937     // statistics are computed and printed.
938     //
939     const int numProcs = comm->getSize();
940     const int myRank = comm->getRank();
941 
942     // Collect local timer data and names.  Filter out timers with
943     // zero call counts if writeZeroTimers is false.  Also, apply the
944     // timer label filter at this point, so we don't have to compute
945     // statistics on timers we don't want to display anyway.
946     timer_map_t localTimerData;
947     Array<std::string> localTimerNames;
948     collectLocalTimerDataAndNames (localTimerData, localTimerNames,
949                                    counters(), writeZeroTimers, filter);
950 
951     // If we're computing global statistics, merge the local timer
952     // data and names into global timer data and names, and compute
953     // global timer statistics.  Otherwise, leave the global data
954     // empty.
955     timer_map_t globalTimerData;
956     Array<std::string> globalTimerNames;
957     stat_map_type statData;
958     std::vector<std::string> statNames;
959     if (writeGlobalStats) {
960       collectGlobalTimerData (globalTimerData, globalTimerNames,
961                               localTimerData, localTimerNames,
962                               comm, alwaysWriteLocal, setOp);
963       // Compute statistics on the data, but only if the communicator
964       // contains more than one process.  Otherwise, statistics don't
965       // make sense and we don't print them (see below).
966       if (numProcs > 1) {
967         computeGlobalTimerStats (statData, statNames, comm, globalTimerData, ignoreZeroTimers);
968       }
969     }
970 
971     // Precision of floating-point numbers in the table.
972     const int precision = format().precision();
973     const std::ios_base::fmtflags& flags = out.flags();
974 
975     // All columns of the table, in order.
976     Array<TableColumn> tableColumns;
977 
978     // Labels of all the columns of the table.
979     // We will append to this when we add each column.
980     Array<std::string> titles;
981 
982     // Widths (in number of characters) of each column.
983     // We will append to this when we add each column.
984     Array<int> columnWidths;
985 
986     // Table column containing all timer names.  If writeGlobalStats
987     // is true, we use the global timer names, otherwise we use the
988     // local timer names.  We build the table on all processes
989     // redundantly, but only print on Rank 0.
990     {
991       titles.append ("Timer Name");
992 
993       // The column labels depend on whether we are computing global statistics.
994       TableColumn nameCol (writeGlobalStats ? globalTimerNames : localTimerNames);
995       tableColumns.append (nameCol);
996 
997       // Each column is as wide as it needs to be to hold both its
998       // title and all of the column data.  This column's title is the
999       // current last entry of the titles array.
1000       columnWidths.append (format().computeRequiredColumnWidth (titles.back(), nameCol));
1001     }
1002 
1003     // Table column containing local timer stats, if applicable.  We
1004     // only write local stats if asked, only on MPI Proc 0, and only
1005     // if there is more than one MPI process in the communicator
1006     // (otherwise local stats == global stats, so we just print the
1007     // global stats).  In this case, we've padded the local data on
1008     // Proc 0 if necessary to match the global timer list, so that the
1009     // columns have the same number of rows.
1010     if (alwaysWriteLocal && numProcs > 1 && myRank == 0) {
1011       titles.append ("Local time (num calls)");
1012 
1013       // Copy local timer data out of the array-of-structs into
1014       // separate arrays, for display in the table.
1015       Array<double> localTimings;
1016       Array<double> localNumCalls;
1017       for (timer_map_t::const_iterator it = localTimerData.begin();
1018            it != localTimerData.end(); ++it) {
1019         localTimings.push_back (it->second.first);
1020         localNumCalls.push_back (static_cast<double> (it->second.second));
1021       }
1022       TableColumn timeAndCalls (localTimings, localNumCalls, precision, flags, true);
1023       tableColumns.append (timeAndCalls);
1024       columnWidths.append (format().computeRequiredColumnWidth (titles.back(), timeAndCalls));
1025     }
1026 
1027     if (writeGlobalStats) {
1028       // If there's only 1 process in the communicator, don't display
1029       // statistics; statistics don't make sense in that case.  Just
1030       // display the timings and call counts.  If there's more than 1
1031       // process, do display statistics.
1032       if (numProcs == 1) {
1033         // Extract timings and the call counts from globalTimerData.
1034         Array<double> globalTimings;
1035         Array<double> globalNumCalls;
1036         for (timer_map_t::const_iterator it = globalTimerData.begin();
1037              it != globalTimerData.end(); ++it) {
1038           globalTimings.push_back (it->second.first);
1039           globalNumCalls.push_back (static_cast<double> (it->second.second));
1040         }
1041         // Print the table column.
1042         titles.append ("Global time (num calls)");
1043         TableColumn timeAndCalls (globalTimings, globalNumCalls, precision, flags, true);
1044         tableColumns.append (timeAndCalls);
1045         columnWidths.append (format().computeRequiredColumnWidth (titles.back(), timeAndCalls));
1046       }
1047       else { // numProcs > 1
1048         // Print a table column for each statistic.  statNames and
1049         // each value in statData use the same ordering, so we can
1050         // iterate over valid indices of statNames to display the
1051         // statistics in the right order.
1052         const timer_map_t::size_type numGlobalTimers = globalTimerData.size();
1053         for (std::vector<std::string>::size_type statInd = 0; statInd < statNames.size(); ++statInd) {
1054           // Extract lists of timings and their call counts for the
1055           // current statistic.
1056           Array<double> statTimings (numGlobalTimers);
1057           Array<double> statCallCounts (numGlobalTimers);
1058           stat_map_type::const_iterator it = statData.begin();
1059           for (int k = 0; it != statData.end(); ++it, ++k) {
1060             statTimings[k] = (it->second[statInd]).first;
1061             statCallCounts[k] = (it->second[statInd]).second;
1062           }
1063           // Print the table column.
1064           const std::string& statisticName = statNames[statInd];
1065           const std::string titleString = statisticName;
1066           titles.append (titleString);
1067           TableColumn timeAndCalls (statTimings, statCallCounts, precision, flags, true);
1068           tableColumns.append (timeAndCalls);
1069           columnWidths.append (format().computeRequiredColumnWidth (titles.back(), timeAndCalls));
1070         }
1071       }
1072     }
1073 
1074     // Print the whole table to the given output stream on MPI Rank 0.
1075     format().setColumnWidths (columnWidths);
1076     if (myRank == 0) {
1077       std::ostringstream theTitle;
1078       theTitle << "TimeMonitor results over " << numProcs << " processor"
1079                << (numProcs > 1 ? "s" : "");
1080       format().writeWholeTable (out, theTitle.str(), titles, tableColumns);
1081     }
1082   }
1083 
1084   void
summarize(std::ostream & out,const bool alwaysWriteLocal,const bool writeGlobalStats,const bool writeZeroTimers,const ECounterSetOp setOp,const std::string & filter,const bool ignoreZeroTimers)1085   TimeMonitor::summarize (std::ostream &out,
1086                           const bool alwaysWriteLocal,
1087                           const bool writeGlobalStats,
1088                           const bool writeZeroTimers,
1089                           const ECounterSetOp setOp,
1090                           const std::string& filter,
1091                           const bool ignoreZeroTimers)
1092   {
1093     // The default communicator.  If Trilinos was built with MPI
1094     // enabled, this should be MPI_COMM_WORLD.  Otherwise, this should
1095     // be a "serial" (no MPI, one "process") communicator.
1096     RCP<const Comm<int> > comm = getDefaultComm();
1097 
1098     summarize (comm.ptr(), out, alwaysWriteLocal,
1099                writeGlobalStats, writeZeroTimers, setOp, filter, ignoreZeroTimers);
1100   }
1101 
1102   void
computeGlobalTimerStatistics(stat_map_type & statData,std::vector<std::string> & statNames,const ECounterSetOp setOp,const std::string & filter)1103   TimeMonitor::computeGlobalTimerStatistics (stat_map_type& statData,
1104                                              std::vector<std::string>& statNames,
1105                                              const ECounterSetOp setOp,
1106                                              const std::string& filter)
1107   {
1108     // The default communicator.  If Trilinos was built with MPI
1109     // enabled, this should be MPI_COMM_WORLD.  Otherwise, this should
1110     // be a "serial" (no MPI, one "process") communicator.
1111     RCP<const Comm<int> > comm = getDefaultComm();
1112 
1113     computeGlobalTimerStatistics (statData, statNames, comm.ptr(), setOp, filter);
1114   }
1115 
SyncTimeMonitor(Time & timer,Ptr<const Comm<int>> comm,bool reset)1116   SyncTimeMonitor::SyncTimeMonitor(Time& timer, Ptr<const Comm<int> > comm, bool reset)
1117     : TimeMonitor(timer, reset), comm_(comm)
1118   { }
1119 
~SyncTimeMonitor()1120   SyncTimeMonitor::~SyncTimeMonitor() {
1121     comm_->barrier();
1122   }
1123 
1124 
1125   namespace {
1126     /// \brief Quote the given string for YAML output.
1127     ///
1128     /// TimeMonitor allows users to provide arbitrary strings as timer
1129     /// labels.  It also allows developers to do the same for
1130     /// statistics labels.  Since YAML has a particular syntax in
1131     /// which certain characters indicate structure, and since we
1132     /// include timer labels in YAML output, we have to modify timer
1133     /// labels slightly in order for them not to violate YAML
1134     /// requirements.
1135     ///
1136     /// We begin by quoting the string (if not already quoted) if it
1137     /// contains a colon.  This is because YAML separates each key
1138     /// from its value in a key: value pair using a colon.  That means
1139     /// that if the key itself contains a colon, we need to quote the
1140     /// whole key (which we do using double quotes).  We use double
1141     /// quotes since YAML allows double-quoted strings to contain
1142     /// anything, vs. just printable characters.
1143     ///
1144     /// We also quote the string if any characters in it need
1145     /// escaping.  For now, we escape double quotes (not counting
1146     /// those that quote whole string) and backslashes.  For an
1147     /// example of quoting double quotes in a string, see Example 5.13
1148     /// in the YAML 1.2 spec.
1149     std::string
quoteLabelForYaml(const std::string & label)1150     quoteLabelForYaml (const std::string& label)
1151     {
1152       // YAML allows empty keys in key: value pairs.  See Section 7.2
1153       // of the YAML 1.2 spec.  We thus let an empty label pass
1154       // through without quoting or other special treatment.
1155       if (label.empty ()) {
1156         return label;
1157       }
1158 
1159       // Check whether the label is already quoted.  If so, we don't
1160       // need to quote it again.  However, we do need to quote any
1161       // quote symbols in the string inside the outer quotes.
1162       const bool alreadyQuoted = label.size () >= 2 &&
1163         label[0] == '"' && label[label.size() - 1] == '"';
1164 
1165       // We need to quote if there are any colons or (inner) quotes in
1166       // the string.  We'll determine this as we read through the
1167       // string and escape any characters that need escaping.
1168       bool needToQuote = false;
1169 
1170       std::string out; // To fill with the return value
1171       out.reserve (label.size ());
1172 
1173       const size_t startPos = alreadyQuoted ? 1 : 0;
1174       const size_t endPos = alreadyQuoted ? label.size () - 1 : label.size ();
1175       for (size_t i = startPos; i < endPos; ++i) {
1176         const char c = label[i];
1177         if (c == '"' || c == '\\') {
1178           out.push_back ('\\'); // Escape the quote or backslash.
1179           needToQuote = true;
1180         }
1181         else if (c == ':') {
1182           needToQuote = true;
1183         }
1184         out.push_back (c);
1185       }
1186 
1187       if (needToQuote || alreadyQuoted) {
1188         // If the input string was already quoted, then out doesn't
1189         // include its quotes, so we have to add them back in.
1190         return "\"" + out + "\"";
1191       }
1192       else {
1193         return out;
1194       }
1195     }
1196 
1197   } // namespace (anonymous)
1198 
1199 
1200   void TimeMonitor::
summarizeToYaml(Ptr<const Comm<int>> comm,std::ostream & out,const ETimeMonitorYamlFormat yamlStyle,const std::string & filter)1201   summarizeToYaml (Ptr<const Comm<int> > comm,
1202                    std::ostream &out,
1203                    const ETimeMonitorYamlFormat yamlStyle,
1204                    const std::string& filter)
1205   {
1206     using Teuchos::FancyOStream;
1207     using Teuchos::fancyOStream;
1208     using Teuchos::getFancyOStream;
1209     using Teuchos::OSTab;
1210     using Teuchos::RCP;
1211     using Teuchos::rcpFromRef;
1212     using std::endl;
1213     typedef std::vector<std::string>::size_type size_type;
1214 
1215     const bool compact = (yamlStyle == YAML_FORMAT_COMPACT);
1216 
1217     // const bool writeGlobalStats = true;
1218     // const bool writeZeroTimers = true;
1219     // const bool alwaysWriteLocal = false;
1220     const ECounterSetOp setOp = Intersection;
1221 
1222     stat_map_type statData;
1223     std::vector<std::string> statNames;
1224     computeGlobalTimerStatistics (statData, statNames, comm, setOp, filter);
1225 
1226     const int numProcs = comm->getSize();
1227 
1228     // HACK (mfh 20 Aug 2012) For some reason, creating OSTab with "-
1229     // " as the line prefix does not work, else I would prefer that
1230     // method for printing each line of a YAML block sequence (see
1231     // Section 8.2.1 of the YAML 1.2 spec).
1232     //
1233     // Also, I have to set the tab indent string here, rather than in
1234     // OSTab's constructor.  This is because line prefix (which for
1235     // some reason is what OSTab's constructor takes, rather than tab
1236     // indent string) means something different from tab indent
1237     // string, and turning on the line prefix prints all sorts of
1238     // things including "|" for some reason.
1239     RCP<FancyOStream> pfout = getFancyOStream (rcpFromRef (out));
1240     pfout->setTabIndentStr ("  ");
1241     FancyOStream& fout = *pfout;
1242 
1243     fout << "# Teuchos::TimeMonitor report" << endl
1244          << "---" << endl;
1245 
1246     // mfh 19 Aug 2012: An important goal of our chosen output format
1247     // was to minimize the nesting depth.  We have managed to keep the
1248     // nesting depth to 3, which is the limit that the current version
1249     // of PylotDB imposes for its YAML input.
1250 
1251     // Outermost level is a dictionary.  (Individual entries of a
1252     // dictionary do _not_ begin with "- ".)  We always print the
1253     // outermost level in standard style, not flow style, for better
1254     // readability.  We begin the outermost level with metadata.
1255     fout << "Output mode: " << (compact ? "compact" : "spacious") << endl
1256          << "Number of processes: " << numProcs << endl
1257          << "Time unit: s" << endl;
1258     // For a key: value pair where the value is a sequence or
1259     // dictionary on the following line, YAML requires a space after
1260     // the colon.
1261     fout << "Statistics collected: ";
1262     // Print list of the names of all the statistics we collected.
1263     if (compact) {
1264       fout << " [";
1265       for (size_type i = 0; i < statNames.size (); ++i) {
1266         fout << quoteLabelForYaml (statNames[i]);
1267         if (i + 1 < statNames.size ()) {
1268           fout << ", ";
1269         }
1270       }
1271       fout << "]" << endl;
1272     }
1273     else {
1274       fout << endl;
1275       OSTab tab1 (pfout);
1276       for (size_type i = 0; i < statNames.size (); ++i) {
1277         fout << "- " << quoteLabelForYaml (statNames[i]) << endl;
1278       }
1279     }
1280 
1281     // Print the list of timer names.
1282     //
1283     // It might be nicer instead to print a map from timer name to all
1284     // of its data, but keeping the maximum nesting depth small
1285     // ensures better compatibility with different parsing tools.
1286     fout << "Timer names: ";
1287     if (compact) {
1288       fout << " [";
1289       size_type ind = 0;
1290       for (stat_map_type::const_iterator it = statData.begin();
1291            it != statData.end(); ++it, ++ind) {
1292         fout << quoteLabelForYaml (it->first);
1293         if (ind + 1 < statData.size ()) {
1294           fout << ", ";
1295         }
1296       }
1297       fout << "]" << endl;
1298     }
1299     else {
1300       fout << endl;
1301       OSTab tab1 (pfout);
1302       for (stat_map_type::const_iterator it = statData.begin();
1303            it != statData.end(); ++it) {
1304         fout << "- " << quoteLabelForYaml (it->first) << endl;
1305       }
1306     }
1307 
1308     // Print times for each timer, as a map from statistic name to its time.
1309     fout << "Total times: ";
1310     if (compact) {
1311       fout << " {";
1312       size_type outerInd = 0;
1313       for (stat_map_type::const_iterator outerIter = statData.begin();
1314            outerIter != statData.end(); ++outerIter, ++outerInd) {
1315         // Print timer name.
1316         fout << quoteLabelForYaml (outerIter->first) << ": ";
1317         // Print that timer's data.
1318         const std::vector<std::pair<double, double> >& curData = outerIter->second;
1319         fout << "{";
1320         for (size_type innerInd = 0; innerInd < curData.size (); ++innerInd) {
1321           fout << quoteLabelForYaml (statNames[innerInd]) << ": "
1322                << curData[innerInd].first;
1323           if (innerInd + 1 < curData.size ()) {
1324             fout << ", ";
1325           }
1326         }
1327         fout << "}";
1328         if (outerInd + 1 < statData.size ()) {
1329           fout << ", ";
1330         }
1331       }
1332       fout << "}" << endl;
1333     }
1334     else {
1335       fout << endl;
1336       OSTab tab1 (pfout);
1337       size_type outerInd = 0;
1338       for (stat_map_type::const_iterator outerIter = statData.begin();
1339            outerIter != statData.end(); ++outerIter, ++outerInd) {
1340         // Print timer name.
1341         fout << quoteLabelForYaml (outerIter->first) << ": " << endl;
1342         // Print that timer's data.
1343         OSTab tab2 (pfout);
1344         const std::vector<std::pair<double, double> >& curData = outerIter->second;
1345         for (size_type innerInd = 0; innerInd < curData.size (); ++innerInd) {
1346           fout << quoteLabelForYaml (statNames[innerInd]) << ": "
1347                << curData[innerInd].first << endl;
1348         }
1349       }
1350     }
1351 
1352     // Print call counts for each timer, for each statistic name.
1353     fout << "Call counts:";
1354     if (compact) {
1355       fout << " {";
1356       size_type outerInd = 0;
1357       for (stat_map_type::const_iterator outerIter = statData.begin();
1358            outerIter != statData.end(); ++outerIter, ++outerInd) {
1359         // Print timer name.
1360         fout << quoteLabelForYaml (outerIter->first) << ": ";
1361         // Print that timer's data.
1362         const std::vector<std::pair<double, double> >& curData = outerIter->second;
1363         fout << "{";
1364         for (size_type innerInd = 0; innerInd < curData.size (); ++innerInd) {
1365           fout << quoteLabelForYaml (statNames[innerInd]) << ": "
1366                << curData[innerInd].second;
1367           if (innerInd + 1 < curData.size ()) {
1368             fout << ", ";
1369           }
1370         }
1371         fout << "}";
1372         if (outerInd + 1 < statData.size ()) {
1373           fout << ", ";
1374         }
1375       }
1376       fout << "}" << endl;
1377     }
1378     else {
1379       fout << endl;
1380       OSTab tab1 (pfout);
1381       size_type outerInd = 0;
1382       for (stat_map_type::const_iterator outerIter = statData.begin();
1383            outerIter != statData.end(); ++outerIter, ++outerInd) {
1384         // Print timer name.
1385         fout << quoteLabelForYaml (outerIter->first) << ": " << endl;
1386         // Print that timer's data.
1387         OSTab tab2 (pfout);
1388         const std::vector<std::pair<double, double> >& curData = outerIter->second;
1389         for (size_type innerInd = 0; innerInd < curData.size (); ++innerInd) {
1390           fout << quoteLabelForYaml (statNames[innerInd]) << ": "
1391                << curData[innerInd].second << endl;
1392         }
1393       }
1394     }
1395   }
1396 
1397   void TimeMonitor::
summarizeToYaml(std::ostream & out,const ETimeMonitorYamlFormat yamlStyle,const std::string & filter)1398   summarizeToYaml (std::ostream &out,
1399                    const ETimeMonitorYamlFormat yamlStyle,
1400                    const std::string& filter)
1401   {
1402     // The default communicator.  If Trilinos was built with MPI
1403     // enabled, this should be MPI_COMM_WORLD.  Otherwise, this should
1404     // be a "serial" (no MPI, one "process") communicator.
1405     RCP<const Comm<int> > comm = getDefaultComm ();
1406 
1407     summarizeToYaml (comm.ptr (), out, yamlStyle, filter);
1408   }
1409 
1410   // Default value is false.  We'll set to true once
1411   // setReportParameters() completes successfully.
1412   bool TimeMonitor::setParams_ = false;
1413 
1414   // We have to declare all of these here in order to avoid linker errors.
1415   TimeMonitor::ETimeMonitorReportFormat TimeMonitor::reportFormat_ = TimeMonitor::REPORT_FORMAT_TABLE;
1416   TimeMonitor::ETimeMonitorYamlFormat TimeMonitor::yamlStyle_ = TimeMonitor::YAML_FORMAT_SPACIOUS;
1417   ECounterSetOp TimeMonitor::setOp_ = Intersection;
1418   bool TimeMonitor::alwaysWriteLocal_ = false;
1419   bool TimeMonitor::writeGlobalStats_ = true;
1420   bool TimeMonitor::writeZeroTimers_ = true;
1421 
1422   void
setReportFormatParameter(ParameterList & plist)1423   TimeMonitor::setReportFormatParameter (ParameterList& plist)
1424   {
1425     const std::string name ("Report format");
1426     const std::string defaultValue ("Table");
1427     const std::string docString ("Output format for report of timer statistics");
1428     Array<std::string> strings;
1429     Array<std::string> docs;
1430     Array<ETimeMonitorReportFormat> values;
1431 
1432     strings.push_back ("YAML");
1433     docs.push_back ("YAML (see yaml.org) format");
1434     values.push_back (REPORT_FORMAT_YAML);
1435     strings.push_back ("Table");
1436     docs.push_back ("Tabular format via Teuchos::TableFormat");
1437     values.push_back (REPORT_FORMAT_TABLE);
1438 
1439     setStringToIntegralParameter<ETimeMonitorReportFormat> (name, defaultValue,
1440                                                             docString,
1441                                                             strings (), docs (),
1442                                                             values (), &plist);
1443   }
1444 
1445   void
setYamlFormatParameter(ParameterList & plist)1446   TimeMonitor::setYamlFormatParameter (ParameterList& plist)
1447   {
1448     const std::string name ("YAML style");
1449     const std::string defaultValue ("spacious");
1450     const std::string docString ("YAML-specific output format");
1451     Array<std::string> strings;
1452     Array<std::string> docs;
1453     Array<ETimeMonitorYamlFormat> values;
1454 
1455     strings.push_back ("compact");
1456     docs.push_back ("Compact format: use \"flow style\" (see YAML 1.2 spec at "
1457                     "yaml.org) for most sequences except the outermost sequence");
1458     values.push_back (YAML_FORMAT_COMPACT);
1459 
1460     strings.push_back ("spacious");
1461     docs.push_back ("Spacious format: avoid flow style");
1462     values.push_back (YAML_FORMAT_SPACIOUS);
1463 
1464     setStringToIntegralParameter<ETimeMonitorYamlFormat> (name, defaultValue,
1465                                                           docString,
1466                                                           strings (), docs (),
1467                                                           values (), &plist);
1468   }
1469 
1470   void
setSetOpParameter(ParameterList & plist)1471   TimeMonitor::setSetOpParameter (ParameterList& plist)
1472   {
1473     const std::string name ("How to merge timer sets");
1474     const std::string defaultValue ("Intersection");
1475     const std::string docString ("How to merge differing sets of timers "
1476                                  "across processes");
1477     Array<std::string> strings;
1478     Array<std::string> docs;
1479     Array<ECounterSetOp> values;
1480 
1481     strings.push_back ("Intersection");
1482     docs.push_back ("Compute intersection of timer sets over processes");
1483     values.push_back (Intersection);
1484     strings.push_back ("Union");
1485     docs.push_back ("Compute union of timer sets over processes");
1486     values.push_back (Union);
1487 
1488     setStringToIntegralParameter<ECounterSetOp> (name, defaultValue, docString,
1489                                                  strings (), docs (), values (),
1490                                                  &plist);
1491   }
1492 
1493   void
setStackedTimer(const Teuchos::RCP<Teuchos::StackedTimer> & t)1494   TimeMonitor::setStackedTimer(const Teuchos::RCP<Teuchos::StackedTimer>& t)
1495   {
1496     stackedTimer_ = t;
1497   }
1498 
1499   Teuchos::RCP<Teuchos::StackedTimer>
getStackedTimer()1500   TimeMonitor::getStackedTimer()
1501   {
1502     return stackedTimer_;
1503   }
1504 
1505   RCP<const ParameterList>
getValidReportParameters()1506   TimeMonitor::getValidReportParameters ()
1507   {
1508     // Our implementation favors recomputation over persistent
1509     // storage.  That is, we simply recreate the list every time we
1510     // need it.
1511     RCP<ParameterList> plist = parameterList ("TimeMonitor::report");
1512 
1513     const bool alwaysWriteLocal = false;
1514     const bool writeGlobalStats = true;
1515     const bool writeZeroTimers = true;
1516 
1517     setReportFormatParameter (*plist);
1518     setYamlFormatParameter (*plist);
1519     setSetOpParameter (*plist);
1520     plist->set ("alwaysWriteLocal", alwaysWriteLocal,
1521                 "Always output local timers' values on Proc 0");
1522     plist->set ("writeGlobalStats", writeGlobalStats, "Always output global "
1523                 "statistics, even if there is only one process in the "
1524                 "communicator");
1525     plist->set ("writeZeroTimers", writeZeroTimers, "Generate output for "
1526                 "timers that have never been called");
1527 
1528     return rcp_const_cast<const ParameterList> (plist);
1529   }
1530 
1531   void
setReportParameters(const RCP<ParameterList> & params)1532   TimeMonitor::setReportParameters (const RCP<ParameterList>& params)
1533   {
1534     ETimeMonitorReportFormat reportFormat = REPORT_FORMAT_TABLE;
1535     ETimeMonitorYamlFormat yamlStyle = YAML_FORMAT_SPACIOUS;
1536     ECounterSetOp setOp = Intersection;
1537     bool alwaysWriteLocal = false;
1538     bool writeGlobalStats = true;
1539     bool writeZeroTimers = true;
1540 
1541     if (params.is_null ()) {
1542       // If we've set parameters before, leave their current values.
1543       // Otherwise, set defaults (below).
1544       if (setParams_) {
1545         return;
1546       }
1547     }
1548     else { // params is nonnull.  Let's read it!
1549       params->validateParametersAndSetDefaults (*getValidReportParameters ());
1550 
1551       reportFormat = getIntegralValue<ETimeMonitorReportFormat> (*params, "Report format");
1552       yamlStyle = getIntegralValue<ETimeMonitorYamlFormat> (*params, "YAML style");
1553       setOp = getIntegralValue<ECounterSetOp> (*params, "How to merge timer sets");
1554       alwaysWriteLocal = params->get<bool> ("alwaysWriteLocal");
1555       writeGlobalStats = params->get<bool> ("writeGlobalStats");
1556       writeZeroTimers = params->get<bool> ("writeZeroTimers");
1557     }
1558     // Defer setting state until here, to ensure the strong exception
1559     // guarantee for this method (either it throws with no externally
1560     // visible state changes, or it returns normally).
1561     reportFormat_ = reportFormat;
1562     yamlStyle_ = yamlStyle;
1563     setOp_ = setOp;
1564     alwaysWriteLocal_ = alwaysWriteLocal;
1565     writeGlobalStats_ = writeGlobalStats;
1566     writeZeroTimers_ = writeZeroTimers;
1567 
1568     setParams_ = true; // Yay, we successfully set parameters!
1569   }
1570 
1571   void
report(Ptr<const Comm<int>> comm,std::ostream & out,const std::string & filter,const RCP<ParameterList> & params)1572   TimeMonitor::report (Ptr<const Comm<int> > comm,
1573                        std::ostream& out,
1574                        const std::string& filter,
1575                        const RCP<ParameterList>& params)
1576   {
1577     setReportParameters (params);
1578 
1579     if (reportFormat_ == REPORT_FORMAT_YAML) {
1580       summarizeToYaml (comm, out, yamlStyle_, filter);
1581     }
1582     else if (reportFormat_ == REPORT_FORMAT_TABLE) {
1583       summarize (comm, out, alwaysWriteLocal_, writeGlobalStats_,
1584                  writeZeroTimers_, setOp_, filter);
1585     }
1586     else {
1587       TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "TimeMonitor::report: "
1588         "Invalid report format.  This should never happen; ParameterList "
1589         "validation should have caught this.  Please report this bug to the "
1590         "Teuchos developers.");
1591     }
1592   }
1593 
1594   void
report(Ptr<const Comm<int>> comm,std::ostream & out,const RCP<ParameterList> & params)1595   TimeMonitor::report (Ptr<const Comm<int> > comm,
1596                        std::ostream& out,
1597                        const RCP<ParameterList>& params)
1598   {
1599     report (comm, out, "", params);
1600   }
1601 
1602   void
report(std::ostream & out,const std::string & filter,const RCP<ParameterList> & params)1603   TimeMonitor::report (std::ostream& out,
1604                        const std::string& filter,
1605                        const RCP<ParameterList>& params)
1606   {
1607     RCP<const Comm<int> > comm = getDefaultComm ();
1608     report (comm.ptr (), out, filter, params);
1609   }
1610 
1611   void
report(std::ostream & out,const RCP<ParameterList> & params)1612   TimeMonitor::report (std::ostream& out,
1613                        const RCP<ParameterList>& params)
1614   {
1615     RCP<const Comm<int> > comm = getDefaultComm ();
1616     report (comm.ptr (), out, "", params);
1617   }
1618 
1619 } // namespace Teuchos
1620