1 // @HEADER 2 // *********************************************************************** 3 // 4 // Teuchos: Common Tools Package 5 // Copyright (2004) Sandia Corporation 6 // 7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 8 // license for use of this work by or on behalf of the U.S. Government. 9 // 10 // Redistribution and use in source and binary forms, with or without 11 // modification, are permitted provided that the following conditions are 12 // met: 13 // 14 // 1. Redistributions of source code must retain the above copyright 15 // notice, this list of conditions and the following disclaimer. 16 // 17 // 2. Redistributions in binary form must reproduce the above copyright 18 // notice, this list of conditions and the following disclaimer in the 19 // documentation and/or other materials provided with the distribution. 20 // 21 // 3. Neither the name of the Corporation nor the names of the 22 // contributors may be used to endorse or promote products derived from 23 // this software without specific prior written permission. 24 // 25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 // 37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 38 // 39 // *********************************************************************** 40 // @HEADER 41 42 #include "Teuchos_TimeMonitor.hpp" 43 #include "Teuchos_CommHelpers.hpp" 44 #include "Teuchos_DefaultComm.hpp" 45 #include "Teuchos_TableColumn.hpp" 46 #include "Teuchos_TableFormat.hpp" 47 #include "Teuchos_StandardParameterEntryValidators.hpp" 48 #include "Teuchos_ScalarTraits.hpp" 49 #include "Teuchos_StackedTimer.hpp" 50 51 #include <functional> 52 #include <iomanip> 53 #ifdef HAVE_TEUCHOS_ADD_TIME_MONITOR_TO_STACKED_TIMER 54 #include <sstream> 55 #endif 56 57 namespace Teuchos { 58 /** 59 * \class MaxLoc 60 * \brief Teuchos version of MPI_MAXLOC. 61 * \author Mark Hoemmen 62 * 63 * \tparam Ordinal The template parameter of \c Comm. 64 * \tparam ScalarType Type for which to find the maximum. 65 * \tparam IndexType Type indicating the index of the maximum. 66 * 67 * \c MPI_MAXLOC is a standard reduction operator provided by the 68 * MPI standard. According to the standard, \c MPI_MAXLOC combines 69 * the (value, index) pairs (u,i) and (v,j) into (w,j), where \f$w = 70 * max(u,v)\f$, and 71 * \f[ 72 * k = \begin{cases} 73 * i & \text{if $u > v$}, \\ 74 * \min(i,j) & \text{if $u = v$}, \\ 75 * j & \text{if $u < v$}. \\ 76 * \end{cases} 77 * \f] 78 79 * This class implements the \c MPI_MAXLOC reduction operator for 80 * the Teuchos communication wrappers. We need to define this 81 * separately from MPI for two reasons. First, the Teuchos comm 82 * wrappers do not currently expose taking an MPI_Op. Second, we 83 * need MaxLoc to be templated on the scalar and index types, in 84 * order to match Teuchos' MPI interface. 85 * 86 * What happens to NaN ("Not a Number")? A NaN is neither less 87 * than, greater than, or equal to any floating-point number or any 88 * NaN. We can alter the above definition slightly so that a \c 89 * MaxLoc reduction has a well-defined result in case the array 90 * contains a NaN: 91 * \f[ 92 * w = \begin{cases} 93 * u & \text{if $u > v$}, \\ 94 * v & \text{if $u < v$}. \\ 95 * u & \text{otherwise}. \\ 96 * \end{cases} 97 * \f] 98 * and 99 * \f[ 100 * k = \begin{cases} 101 * i & \text{if $u > v$}, \\ 102 * j & \text{if $u < v$}. \\ 103 * \min(i,j) & \text{otherwise}. \\ 104 * \end{cases} 105 * \f] 106 * Defining \c MaxLoc in this way ensures that for any array 107 * containing a NaN, the value (w) returned is the first NaN, and 108 * the index (k) returned is the index of the first NaN. 109 */ 110 template<class Ordinal, class ScalarType, class IndexType> 111 class MaxLoc : 112 public ValueTypeReductionOp<Ordinal, std::pair<ScalarType, IndexType> > { 113 public: 114 void 115 reduce (const Ordinal count, 116 const std::pair<ScalarType, IndexType> inBuffer[], 117 std::pair<ScalarType, IndexType> inoutBuffer[]) const; 118 }; 119 120 template<class Ordinal> 121 class MaxLoc<Ordinal, double, int> : 122 public ValueTypeReductionOp<Ordinal, std::pair<double, int> > { 123 public: 124 void reduce(const Ordinal count,const std::pair<double,int> inBuffer[],std::pair<double,int> inoutBuffer[]) const125 reduce (const Ordinal count, 126 const std::pair<double, int> inBuffer[], 127 std::pair<double, int> inoutBuffer[]) const 128 { 129 for (Ordinal ind = 0; ind < count; ++ind) { 130 const std::pair<double, int>& in = inBuffer[ind]; 131 std::pair<double, int>& inout = inoutBuffer[ind]; 132 133 if (in.first > inout.first) { 134 inout.first = in.first; 135 inout.second = in.second; 136 } else if (in.first < inout.first) { 137 // Don't need to do anything; inout has the values. 138 } else { // equal, or at least one is NaN. 139 inout.first = in.first; 140 inout.second = std::min (in.second, inout.second); 141 } 142 } 143 } 144 }; 145 146 /** \class MinLoc 147 * \brief Teuchos version of MPI_MINLOC. 148 * \author Mark Hoemmen 149 * 150 * \tparam Ordinal The template parameter of \c Comm. 151 * \tparam ScalarType Type for which to find the minimum. 152 * \tparam IndexType Type indicating the index of the minimum. 153 * 154 * \c MPI_MINLOC is a standard reduction operator provided by the 155 * MPI standard. According to the standard, \c MPI_MINLOC combines 156 * the (value, index) pairs (u,i) and (v,j) into (w,j), where \f$w = 157 * min(u,v)\f$, and 158 * \f[ 159 * k = \begin{cases} 160 * i & \text{if $u < v$}, \\ 161 * \min(i,j) & \text{if $u = v$}, \\ 162 * j & \text{if $u > v$}. \\ 163 * \end{cases} 164 * \f] 165 * This class implements the \c MPI_MINLOC reduction operator for 166 * the Teuchos communication wrappers. 167 * 168 * Refer to the note in the documentation of \c MaxLoc that 169 * explains how we adjust the above definition to produce 170 * well-defined results even if the array contains a NaN. 171 */ 172 template<class Ordinal, class ScalarType, class IndexType> 173 class MinLoc : 174 public ValueTypeReductionOp<Ordinal, std::pair<ScalarType, IndexType> > { 175 public: 176 void 177 reduce (const Ordinal count, 178 const std::pair<ScalarType, IndexType> inBuffer[], 179 std::pair<ScalarType, IndexType> inoutBuffer[]) const; 180 }; 181 182 template<class Ordinal> 183 class MinLoc<Ordinal, double, int> : 184 public ValueTypeReductionOp<Ordinal, std::pair<double, int> > { 185 public: 186 void reduce(const Ordinal count,const std::pair<double,int> inBuffer[],std::pair<double,int> inoutBuffer[]) const187 reduce (const Ordinal count, 188 const std::pair<double, int> inBuffer[], 189 std::pair<double, int> inoutBuffer[]) const 190 { 191 for (Ordinal ind = 0; ind < count; ++ind) { 192 const std::pair<double, int>& in = inBuffer[ind]; 193 std::pair<double, int>& inout = inoutBuffer[ind]; 194 195 if (in.first < inout.first) { 196 inout.first = in.first; 197 inout.second = in.second; 198 } else if (in.first > inout.first) { 199 // Don't need to do anything; inout has the values. 200 } else { // equal, or at least one is NaN. 201 inout.first = in.first; 202 inout.second = std::min (in.second, inout.second); 203 } 204 } 205 } 206 }; 207 208 /** \class MinLocNonzero 209 * \brief same as MinLoc, but don't allow zero 210 */ 211 template<class Ordinal, class ScalarType, class IndexType> 212 class MinLocNonzero : 213 public ValueTypeReductionOp<Ordinal, std::pair<ScalarType, IndexType> > { 214 public: 215 void 216 reduce (const Ordinal count, 217 const std::pair<ScalarType, IndexType> inBuffer[], 218 std::pair<ScalarType, IndexType> inoutBuffer[]) const; 219 }; 220 221 template<class Ordinal> 222 class MinLocNonzero<Ordinal, double, int> : 223 public ValueTypeReductionOp<Ordinal, std::pair<double, int> > { 224 public: 225 void reduce(const Ordinal count,const std::pair<double,int> inBuffer[],std::pair<double,int> inoutBuffer[]) const226 reduce (const Ordinal count, 227 const std::pair<double, int> inBuffer[], 228 std::pair<double, int> inoutBuffer[]) const 229 { 230 for (Ordinal ind = 0; ind < count; ++ind) { 231 const std::pair<double, int>& in = inBuffer[ind]; 232 std::pair<double, int>& inout = inoutBuffer[ind]; 233 234 if ( (in.first < inout.first && in.first != 0) || (inout.first == 0 && in.first != 0) ) { 235 inout.first = in.first; 236 inout.second = in.second; 237 } else if (in.first > inout.first) { 238 // Don't need to do anything; inout has the values. 239 } else { // equal, or at least one is NaN. 240 inout.first = in.first; 241 inout.second = std::min (in.second, inout.second); 242 } 243 } 244 } 245 }; 246 247 // Typedef used internally by TimeMonitor::summarize() and its 248 // helper functions. The map is keyed on timer label (a string). 249 // Each value is a pair: (total number of seconds over all calls to 250 // that timer, total number of calls to that timer). 251 typedef std::map<std::string, std::pair<double, int> > timer_map_t; 252 253 // static initialization 254 Teuchos::RCP<Teuchos::StackedTimer> TimeMonitor::stackedTimer_ = Teuchos::rcp(new Teuchos::StackedTimer("Teuchos::StackedTimer")); 255 TimeMonitor(Time & timer,bool reset)256 TimeMonitor::TimeMonitor (Time& timer, bool reset) 257 : PerformanceMonitorBase<Time>(timer, reset) 258 { 259 if (!isRecursiveCall()) { 260 counter().start(reset); 261 #ifdef HAVE_TEUCHOS_ADD_TIME_MONITOR_TO_STACKED_TIMER 262 if (nonnull(stackedTimer_)) 263 stackedTimer_->start(counter().name(),false); 264 #endif 265 } 266 } 267 ~TimeMonitor()268 TimeMonitor::~TimeMonitor() { 269 if (!isRecursiveCall()) { 270 counter().stop(); 271 #ifdef HAVE_TEUCHOS_ADD_TIME_MONITOR_TO_STACKED_TIMER 272 try { 273 if (nonnull(stackedTimer_)) 274 stackedTimer_->stop(counter().name(),false); 275 } 276 catch (std::runtime_error&) { 277 std::ostringstream warning; 278 warning << 279 "\n*********************************************************************\n" 280 "WARNING: Overlapping timers detected! Near: " <<counter().name()<<"\n" 281 "A TimeMonitor timer was stopped before a nested subtimer was\n" 282 "stopped. This is not allowed by the StackedTimer. This corner case\n" 283 "typically occurs if the TimeMonitor is stored in an RCP and the RCP is\n" 284 "assigned to a new timer. To disable this warning, either fix the\n" 285 "ordering of timer creation and destuction or disable the StackedTimer\n" 286 "support in the TimeMonitor by setting the StackedTimer to null\n" 287 "Example:\n" 288 " RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(\"Junk\"))));\n" 289 "///code to time \n" 290 "MM = Teuchos::null;\n" 291 "MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(\"SecondJunk\"))));\n" 292 "*********************************************************************\n"; 293 std::cout << warning.str() << std::endl; 294 Teuchos::TimeMonitor::setStackedTimer(Teuchos::null); 295 } 296 #endif 297 } 298 } 299 300 void disableTimer(const std::string & name)301 TimeMonitor::disableTimer (const std::string& name) 302 { 303 RCP<Time> timer = lookupCounter (name); 304 TEUCHOS_TEST_FOR_EXCEPTION( 305 timer == null, std::invalid_argument, 306 "TimeMonitor::disableTimer: Invalid timer \"" << name << "\""); 307 timer->disable (); 308 } 309 310 void enableTimer(const std::string & name)311 TimeMonitor::enableTimer (const std::string& name) 312 { 313 RCP<Time> timer = lookupCounter (name); 314 TEUCHOS_TEST_FOR_EXCEPTION( 315 timer == null, std::invalid_argument, 316 "TimeMonitor::enableTimer: Invalid timer \"" << name << "\""); 317 timer->enable (); 318 } 319 320 void zeroOutTimers()321 TimeMonitor::zeroOutTimers() 322 { 323 typedef std::map<std::string, RCP<Time> > map_type; 324 typedef map_type::iterator iter_type; 325 map_type& ctrs = counters (); 326 327 // In debug mode, loop first to check whether any of the timers 328 // are running, before resetting them. This ensures that this 329 // method satisfies the strong exception guarantee (either it 330 // completes normally, or there are no side effects). 331 #ifdef TEUCHOS_DEBUG 332 for (iter_type it = ctrs.begin(); it != ctrs.end(); ++it) { 333 // We throw a runtime_error rather than a logic_error, because 334 // logic_error suggests a bug in the implementation of 335 // TimeMonitor. Calling zeroOutTimers() when a timer is running 336 // is not TimeMonitor's fault. 337 TEUCHOS_TEST_FOR_EXCEPTION( 338 it->second->isRunning (), std::runtime_error, 339 "Timer \"" << it->second->name () << "\" is currently running. " 340 "You are not allowed to reset running timers."); 341 } 342 #endif // TEUCHOS_DEBUG 343 344 for (iter_type it = ctrs.begin(); it != ctrs.end(); ++it) { 345 it->second->reset (); 346 } 347 } 348 349 // An anonymous namespace is the standard way of limiting linkage of 350 // its contained routines to file scope. 351 namespace { 352 // \brief Return an "empty" local timer datum. 353 // 354 // "Empty" means the datum has zero elapsed time and zero call 355 // count. This function does not actually create a timer. 356 // 357 // \param name The timer's name. 358 std::pair<std::string, std::pair<double, int> > makeEmptyTimerDatum(const std::string & name)359 makeEmptyTimerDatum (const std::string& name) 360 { 361 return std::make_pair (name, std::make_pair (double(0), int(0))); 362 } 363 364 // \fn collectLocalTimerData 365 // \brief Collect and sort local timer data by timer names. 366 // 367 // \param localData [out] Map whose keys are the timer names, and 368 // whose value for each key is the total elapsed time (in 369 // seconds) and the call count for the timer with that name. 370 // 371 // \param localCounters [in] Timers from which to extract data. 372 // 373 // \param filter [in] Filter for timer labels. If filter is not 374 // empty, this method will only collect data for local timers 375 // whose labels begin with this string. 376 // 377 // Extract the total elapsed time and call count from each timer 378 // in the given array. Merge results for timers with duplicate 379 // labels, by summing their total elapsed times and call counts 380 // pairwise. 381 void collectLocalTimerData(timer_map_t & localData,const std::map<std::string,RCP<Time>> & localCounters,const std::string & filter="")382 collectLocalTimerData (timer_map_t& localData, 383 const std::map<std::string, RCP<Time> >& localCounters, 384 const std::string& filter="") 385 { 386 using std::make_pair; 387 typedef timer_map_t::iterator iter_t; 388 389 timer_map_t theLocalData; 390 for (std::map<std::string, RCP<Time> >::const_iterator it = localCounters.begin(); 391 it != localCounters.end(); ++it) { 392 const std::string& name = it->second->name (); 393 394 // Filter current timer name, if provided filter is nonempty. 395 // Filter string must _start_ the timer label, not just be in it. 396 const bool skipThisOne = (filter != "" && name.find (filter) != 0); 397 if (! skipThisOne) { 398 const double timing = it->second->totalElapsedTime (); 399 const int numCalls = it->second->numCalls (); 400 401 // Merge timers with duplicate labels, by summing their 402 // total elapsed times and call counts. 403 iter_t loc = theLocalData.find (name); 404 if (loc == theLocalData.end()) { 405 // Use loc as an insertion location hint. 406 theLocalData.insert (loc, make_pair (name, make_pair (timing, numCalls))); 407 } 408 else { 409 loc->second.first += timing; 410 loc->second.second += numCalls; 411 } 412 } 413 } 414 // This avoids copying the map, and also makes this method 415 // satisfy the strong exception guarantee. 416 localData.swap (theLocalData); 417 } 418 419 // \brief Locally filter out timer data with zero call counts. 420 // 421 // \param timerData [in/out] 422 void filterZeroData(timer_map_t & timerData)423 filterZeroData (timer_map_t& timerData) 424 { 425 // FIXME (mfh 15 Mar 2013) Should use std::map::erase with 426 // iterator hint, instead of rebuilding the map completely. 427 timer_map_t newTimerData; 428 for (timer_map_t::const_iterator it = timerData.begin(); 429 it != timerData.end(); ++it) { 430 if (it->second.second > 0) { 431 newTimerData[it->first] = it->second; 432 } 433 } 434 timerData.swap (newTimerData); 435 } 436 437 /// \fn collectLocalTimerDataAndNames 438 /// \brief Collect the local timer data and timer names from the timers. 439 /// 440 /// \param localTimerData [out] Timer data extracted from 441 /// localTimers. Contents on input are ignored and overwritten. 442 /// See the documentation of \c collectLocalTimerData(). 443 /// 444 /// \param localTimerNames [out] On output: names of timers 445 /// extracted from \c localTimers, in the same order as the keys 446 /// of \c localTimerData. Resized as necessary. Contents on 447 /// input are ignored and overwritten. 448 /// 449 /// \param localTimers [in] (Local) timers from which to extract data. 450 /// 451 /// \param writeZeroTimers [in] If true, do not include timers 452 /// with zero call counts in the \c localTimerData and \c 453 /// localTimerNames output. 454 /// 455 /// \param filter [in] Filter for timer labels. If filter is not 456 /// empty, this method will only collect data for local timers 457 /// whose labels begin with this string. 458 void collectLocalTimerDataAndNames(timer_map_t & localTimerData,Array<std::string> & localTimerNames,const std::map<std::string,RCP<Time>> & localTimers,const bool writeZeroTimers,const std::string & filter="")459 collectLocalTimerDataAndNames (timer_map_t& localTimerData, 460 Array<std::string>& localTimerNames, 461 const std::map<std::string, RCP<Time> >& localTimers, 462 const bool writeZeroTimers, 463 const std::string& filter="") 464 { 465 // Collect and sort local timer data by timer names. 466 collectLocalTimerData (localTimerData, localTimers, filter); 467 468 // Filter out zero data locally first. This ensures that if we 469 // are writing global stats, and if a timer name exists in the 470 // set of global names, then that timer has a nonzero call count 471 // on at least one MPI process. 472 if (! writeZeroTimers) { 473 filterZeroData (localTimerData); 474 } 475 476 // Extract the set of local timer names. The std::map keeps 477 // them sorted alphabetically. 478 localTimerNames.reserve (localTimerData.size()); 479 for (timer_map_t::const_iterator it = localTimerData.begin(); 480 it != localTimerData.end(); ++it) { 481 localTimerNames.push_back (it->first); 482 } 483 } 484 485 /// \brief Merge local timer data into global data. 486 /// 487 /// Call this method in \c summarize() only if the \c 488 /// writeGlobalStats argument is true. 489 /// 490 /// \param globalTimerData [out] Result of merging localTimerData 491 /// over the processes in the given communicator. 492 /// 493 /// \param globalTimerNames [out] Names of the timers in 494 /// globalTimerData; same as the keys of the map. 495 /// 496 /// \param localTimerData [in/out] On input: the first return 497 /// value of \c collectLocalTimerDataAndNames(). On output, if 498 /// writeZeroTimers is true, data for timers with zero call 499 /// counts will be removed. 500 /// 501 /// \param localTimerNames [in/out] On input: the second return 502 /// value of \c collectLocalTimerDataAndNames(). On output, if 503 /// writeZeroTimers is true, names of timers with zero call 504 /// counts will be removed. 505 /// 506 /// \param comm [in] Communicator over which to merge. 507 /// 508 /// \param alwaysWriteLocal [in] If true, and if the local set of 509 /// timers differs from the global set of timers (either the 510 /// union or the intersection, depending on \c setOp), Proc 0 511 /// will create corresponding timer data in localTimerData with 512 /// zero elapsed times and call counts. (This pads the output 513 /// so it fits in a tabular form.) 514 /// 515 /// \param setOp [in] If \c Intersection, compute the intersection 516 /// of all created timers over all processes in the 517 /// communicator. If \c Union, compute the union of all created 518 /// timers over all processes in the communicator. 519 void collectGlobalTimerData(timer_map_t & globalTimerData,Array<std::string> & globalTimerNames,timer_map_t & localTimerData,Array<std::string> & localTimerNames,Ptr<const Comm<int>> comm,const bool alwaysWriteLocal,const ECounterSetOp setOp)520 collectGlobalTimerData (timer_map_t& globalTimerData, 521 Array<std::string>& globalTimerNames, 522 timer_map_t& localTimerData, 523 Array<std::string>& localTimerNames, 524 Ptr<const Comm<int> > comm, 525 const bool alwaysWriteLocal, 526 const ECounterSetOp setOp) 527 { 528 // There may be some global timers that are not local timers on 529 // the calling MPI process(es). In that case, if 530 // alwaysWriteLocal is true, then we need to fill in the 531 // "missing" local timers. That will ensure that both global 532 // and local timer columns in the output table have the same 533 // number of rows. The collectLocalTimerDataAndNames() method 534 // may have already filtered out local timers with zero call 535 // counts (if its writeZeroTimers argument was false), but we 536 // won't be filtering again. Thus, any local timer data we 537 // insert here won't get filtered out. 538 // 539 // Note that calling summarize() with writeZeroTimers == false 540 // will still do what it says, even if we insert local timers 541 // with zero call counts here. 542 543 // This does the correct and inexpensive thing (just copies the 544 // timer data) if numProcs == 1. Otherwise, it initiates a 545 // communication with \f$O(\log P)\f$ messages along the 546 // critical path, where \f$P\f$ is the number of participating 547 // processes. 548 mergeCounterNames (*comm, localTimerNames, globalTimerNames, setOp); 549 550 #ifdef TEUCHOS_DEBUG 551 { 552 // Sanity check that all processes have the name number of 553 // global timer names. 554 const timer_map_t::size_type myNumGlobalNames = globalTimerNames.size(); 555 timer_map_t::size_type minNumGlobalNames = 0; 556 timer_map_t::size_type maxNumGlobalNames = 0; 557 reduceAll (*comm, REDUCE_MIN, myNumGlobalNames, 558 outArg (minNumGlobalNames)); 559 reduceAll (*comm, REDUCE_MAX, myNumGlobalNames, 560 outArg (maxNumGlobalNames)); 561 TEUCHOS_TEST_FOR_EXCEPTION(minNumGlobalNames != maxNumGlobalNames, 562 std::logic_error, "Min # global timer names = " << minNumGlobalNames 563 << " != max # global timer names = " << maxNumGlobalNames 564 << ". Please report this bug to the Teuchos developers."); 565 TEUCHOS_TEST_FOR_EXCEPTION(myNumGlobalNames != minNumGlobalNames, 566 std::logic_error, "My # global timer names = " << myNumGlobalNames 567 << " != min # global timer names = " << minNumGlobalNames 568 << ". Please report this bug to the Teuchos developers."); 569 } 570 #endif // TEUCHOS_DEBUG 571 572 // mergeCounterNames() just merges the counters' names, not 573 // their actual data. Now we need to fill globalTimerData with 574 // this process' timer data for the timers in globalTimerNames. 575 // 576 // All processes need the full list of global timers, since 577 // there may be some global timers that are not local timers. 578 // That's why mergeCounterNames() has to be an all-reduce, not 579 // just a reduction to Proc 0. 580 // 581 // Insertion optimization: if the iterator given to map::insert 582 // points right before where we want to insert, insertion is 583 // O(1). globalTimerNames is sorted, so feeding the iterator 584 // output of map::insert into the next invocation's input should 585 // make the whole insertion O(N) where N is the number of 586 // entries in globalTimerNames. 587 timer_map_t::iterator globalMapIter = globalTimerData.begin(); 588 timer_map_t::iterator localMapIter; 589 for (Array<string>::const_iterator it = globalTimerNames.begin(); 590 it != globalTimerNames.end(); ++it) { 591 const std::string& globalName = *it; 592 localMapIter = localTimerData.find (globalName); 593 594 if (localMapIter == localTimerData.end()) { 595 if (alwaysWriteLocal) { 596 // If there are some global timers that are not local 597 // timers, and if we want to print local timers, we insert 598 // a local timer datum with zero elapsed time and zero 599 // call count into localTimerData as well. This will 600 // ensure that both global and local timer columns in the 601 // output table have the same number of rows. 602 // 603 // We really only need to do this on Proc 0, which is the 604 // only process that currently may print local timers. 605 // However, we do it on all processes, just in case 606 // someone later wants to modify this function to print 607 // out local timer data for some process other than Proc 608 // 0. This extra computation won't affect the cost along 609 // the critical path, for future computations in which 610 // Proc 0 participates. 611 localMapIter = localTimerData.insert (localMapIter, makeEmptyTimerDatum (globalName)); 612 613 // Make sure the missing global name gets added to the 614 // list of local names. We'll re-sort the list of local 615 // names below. 616 localTimerNames.push_back (globalName); 617 } 618 // There's a global timer that's not a local timer. Add it 619 // to our pre-merge version of the global timer data so that 620 // we can safely merge the global timer data later. 621 globalMapIter = globalTimerData.insert (globalMapIter, makeEmptyTimerDatum (globalName)); 622 } 623 else { 624 // We have this global timer name in our local timer list. 625 // Fill in our pre-merge version of the global timer data 626 // with our local data. 627 globalMapIter = globalTimerData.insert (globalMapIter, std::make_pair (globalName, localMapIter->second)); 628 } 629 } 630 631 if (alwaysWriteLocal) { 632 // Re-sort the list of local timer names, since we may have 633 // inserted "missing" names above. 634 std::sort (localTimerNames.begin(), localTimerNames.end()); 635 } 636 637 #ifdef TEUCHOS_DEBUG 638 { 639 // Sanity check that all processes have the name number of 640 // global timers. 641 const timer_map_t::size_type myNumGlobalTimers = globalTimerData.size(); 642 timer_map_t::size_type minNumGlobalTimers = 0; 643 timer_map_t::size_type maxNumGlobalTimers = 0; 644 reduceAll (*comm, REDUCE_MIN, myNumGlobalTimers, 645 outArg (minNumGlobalTimers)); 646 reduceAll (*comm, REDUCE_MAX, myNumGlobalTimers, 647 outArg (maxNumGlobalTimers)); 648 TEUCHOS_TEST_FOR_EXCEPTION(minNumGlobalTimers != maxNumGlobalTimers, 649 std::logic_error, "Min # global timers = " << minNumGlobalTimers 650 << " != max # global timers = " << maxNumGlobalTimers 651 << ". Please report this bug to the Teuchos developers."); 652 TEUCHOS_TEST_FOR_EXCEPTION(myNumGlobalTimers != minNumGlobalTimers, 653 std::logic_error, "My # global timers = " << myNumGlobalTimers 654 << " != min # global timers = " << minNumGlobalTimers 655 << ". Please report this bug to the Teuchos developers."); 656 } 657 #endif // TEUCHOS_DEBUG 658 } 659 660 /// \brief Compute global timer statistics. 661 /// 662 /// Currently, this function computes the "MinOverProcs", 663 /// "MeanOverProcs", "MaxOverProcs", and "MeanOverCallCounts" 664 /// timings for each timer. Along with the min / max timing comes 665 /// the call count of the process who had the min / max. (If more 666 /// than one process had the min / max, then the call count on the 667 /// process with the smallest rank is reported.) 668 /// 669 /// "MeanOverProcs" is the arithmetic mean of the cumulative 670 /// timings over processes. It ignores the call counts. It 671 /// includes the mean call count, which may be fractional, and has 672 /// no particular connection to the mean timing. 673 /// 674 /// The "MeanOverCallCounts" is an arithmetic mean of timings that 675 /// accounts for call counts. Each timing is the sum over all 676 /// calls. Thus, this mean equals the sum of the timing over all 677 /// processes, divided by the sum of the call counts over all 678 /// processes for that timing. (We compute it a bit differently 679 /// to help prevent overflow.) Along with the mean timing comes 680 /// the same mean call count as mentioned above. 681 /// 682 /// Statistics may optionally be calculated in a mode that ignores 683 /// contributions from processes that either do not have a timer 684 /// or have a hard zero for that time. This is slightly more expensive 685 /// than the default mode, as it requires one additional all-reduce. 686 /// 687 /// \param[out] statData On output: Global timer statistics. See 688 /// the \c stat_map_type typedef documentation for an explanation 689 /// of the data structure. 690 /// 691 /// \param[out] statNames On output: Each value in the statData 692 /// map is a vector. That vector v has the same number of 693 /// entries as statNames. statNames[k] is the name of the 694 /// statistic (e.g., "Min", "MeanOverProcs", "Max", or 695 /// "MeanOverCallCounts") stored as v[k]. 696 /// 697 /// \param[in] comm Communicator over which to compute statistics. 698 /// 699 /// \param[in] globalTimerData Output with the same name of the 700 /// \c collectGlobalTimerData() function. That function assures 701 /// that all processes have the same keys stored in this map. 702 /// 703 /// \param[in] ignoreZeroTimers [in] Processes that either do not have 704 /// a particular timer or have zero time for a timer are not used 705 /// in calculating global statistics. 706 void computeGlobalTimerStats(stat_map_type & statData,std::vector<std::string> & statNames,Ptr<const Comm<int>> comm,const timer_map_t & globalTimerData,const bool ignoreZeroTimers)707 computeGlobalTimerStats (stat_map_type& statData, 708 std::vector<std::string>& statNames, 709 Ptr<const Comm<int> > comm, 710 const timer_map_t& globalTimerData, 711 const bool ignoreZeroTimers) 712 { 713 using Teuchos::ScalarTraits; 714 715 const int numTimers = static_cast<int> (globalTimerData.size()); 716 const int numProcs = comm->getSize(); 717 718 // Extract pre-reduction timings and call counts into a 719 // sequential array. This array will be in the same order as 720 // the global timer names are in the map. 721 Array<std::pair<double, int> > timingsAndCallCounts; 722 timingsAndCallCounts.reserve (numTimers); 723 for (timer_map_t::const_iterator it = globalTimerData.begin(); 724 it != globalTimerData.end(); ++it) { 725 timingsAndCallCounts.push_back (it->second); 726 } 727 728 // For each timer name, compute the min timing and its 729 // corresponding call count. If two processes have the same 730 // timing but different call counts, the minimum call count will 731 // be used. 732 Array<std::pair<double, int> > minTimingsAndCallCounts (numTimers); 733 if (numTimers > 0) { 734 if (ignoreZeroTimers) 735 reduceAll (*comm, MinLocNonzero<int, double, int>(), numTimers, 736 &timingsAndCallCounts[0], &minTimingsAndCallCounts[0]); 737 else 738 reduceAll (*comm, MinLoc<int, double, int>(), numTimers, 739 &timingsAndCallCounts[0], &minTimingsAndCallCounts[0]); 740 } 741 742 // For each timer name, compute the max timing and its 743 // corresponding call count. If two processes have the same 744 // timing but different call counts, the minimum call count will 745 // be used. 746 Array<std::pair<double, int> > maxTimingsAndCallCounts (numTimers); 747 if (numTimers > 0) { 748 reduceAll (*comm, MaxLoc<int, double, int>(), numTimers, 749 &timingsAndCallCounts[0], &maxTimingsAndCallCounts[0]); 750 } 751 752 // For each timer name, compute the mean-over-processes timing, 753 // the mean call count, and the mean-over-call-counts timing. 754 // The mean call count is reported as a double to allow a 755 // fractional value. 756 // 757 // Each local timing is really the total timing over all local 758 // invocations. The number of local invocations is the call 759 // count. Thus, the mean-over-call-counts timing is the sum of 760 // all the timings (over all processes), divided by the sum of 761 // all the call counts (over all processes). We compute it in a 762 // different way to over unnecessary overflow. 763 Array<double> meanOverCallCountsTimings (numTimers); 764 Array<double> meanOverProcsTimings (numTimers); 765 Array<double> meanCallCounts (numTimers); 766 Array<int> ICallThisTimer (numTimers); 767 Array<int> numProcsCallingEachTimer (numTimers); 768 { 769 // Figure out how many processors actually call each timer. 770 if (ignoreZeroTimers) { 771 for (int k = 0; k < numTimers; ++k) { 772 const double callCount = static_cast<double> (timingsAndCallCounts[k].second); 773 if (callCount > 0) ICallThisTimer[k] = 1; 774 else ICallThisTimer[k] = 0; 775 } 776 if (numTimers > 0) { 777 reduceAll (*comm, REDUCE_SUM, numTimers, &ICallThisTimer[0], 778 &numProcsCallingEachTimer[0]); 779 } 780 } 781 782 // When summing, first scale by the number of processes. This 783 // avoids unnecessary overflow, and also gives us the mean 784 // call count automatically. 785 Array<double> scaledTimings (numTimers); 786 Array<double> scaledCallCounts (numTimers); 787 const double P = static_cast<double> (numProcs); 788 789 if (ignoreZeroTimers) { 790 for (int k = 0; k < numTimers; ++k) { 791 const double timing = timingsAndCallCounts[k].first; 792 const double callCount = static_cast<double> (timingsAndCallCounts[k].second); 793 794 scaledTimings[k] = timing / numProcsCallingEachTimer[k]; 795 scaledCallCounts[k] = callCount / numProcsCallingEachTimer[k]; 796 } 797 } 798 else { 799 for (int k = 0; k < numTimers; ++k) { 800 const double timing = timingsAndCallCounts[k].first; 801 const double callCount = static_cast<double> (timingsAndCallCounts[k].second); 802 803 scaledTimings[k] = timing / P; 804 scaledCallCounts[k] = callCount / P; 805 } 806 } 807 808 if (numTimers > 0) { 809 reduceAll (*comm, REDUCE_SUM, numTimers, &scaledTimings[0], 810 &meanOverProcsTimings[0]); 811 reduceAll (*comm, REDUCE_SUM, numTimers, &scaledCallCounts[0], 812 &meanCallCounts[0]); 813 } 814 // We don't have to undo the scaling for the mean timings; 815 // just divide by the scaled call count. 816 for (int k = 0; k < numTimers; ++k) { 817 if (meanCallCounts[k] > ScalarTraits<double>::zero ()) { 818 meanOverCallCountsTimings[k] = meanOverProcsTimings[k] / meanCallCounts[k]; 819 } 820 else { 821 meanOverCallCountsTimings[k] = ScalarTraits<double>::zero (); 822 } 823 } 824 } 825 826 // Reformat the data into the map of statistics. Be sure that 827 // each value (the std::vector of (timing, call count) pairs, 828 // each entry of which is a different statistic) preserves the 829 // order of statNames. 830 statNames.resize (4); 831 statNames[0] = "MinOverProcs"; 832 statNames[1] = "MeanOverProcs"; 833 statNames[2] = "MaxOverProcs"; 834 statNames[3] = "MeanOverCallCounts"; 835 836 stat_map_type::iterator statIter = statData.end(); 837 timer_map_t::const_iterator it = globalTimerData.begin(); 838 for (int k = 0; it != globalTimerData.end(); ++k, ++it) { 839 std::vector<std::pair<double, double> > curData (4); 840 curData[0] = minTimingsAndCallCounts[k]; 841 curData[1] = std::make_pair (meanOverProcsTimings[k], meanCallCounts[k]); 842 curData[2] = maxTimingsAndCallCounts[k]; 843 curData[3] = std::make_pair (meanOverCallCountsTimings[k], meanCallCounts[k]); 844 845 // statIter gives an insertion location hint that makes each 846 // insertion O(1), since we remember the location of the last 847 // insertion. 848 statIter = statData.insert (statIter, std::make_pair (it->first, curData)); 849 } 850 } 851 852 853 /// \brief Get a default communicator appropriate for the environment. 854 /// 855 /// If Trilinos was configured with MPI support, and if MPI has 856 /// been initialized, return a wrapped MPI_COMM_WORLD. If 857 /// Trilinos was configured with MPI support, and if MPI has not 858 /// yet been initialized, return a serial communicator (containing 859 /// one process). If Trilinos was <i>not</i> configured with MPI 860 /// support, return a serial communicator. 861 /// 862 /// Rationale: Callers may or may not have initialized MPI before 863 /// calling this method. Just because they built with MPI, 864 /// doesn't mean they want to use MPI. It's not my responsibility 865 /// to initialize MPI for them, and I don't have the context I 866 /// need in order to do so anyway. Thus, if Trilinos was built 867 /// with MPI and MPI has not yet been initialized, this method 868 /// returns a "serial" communicator. 869 RCP<const Comm<int> > getDefaultComm()870 getDefaultComm () 871 { 872 // The default communicator. If Trilinos was built with MPI 873 // enabled, this should be MPI_COMM_WORLD. (If MPI has not yet 874 // been initialized, it's not valid to use the communicator!) 875 // Otherwise, this should be a "serial" (no MPI, one "process") 876 // communicator. 877 RCP<const Comm<int> > comm = DefaultComm<int>::getComm (); 878 879 #ifdef HAVE_MPI 880 { 881 int mpiHasBeenStarted = 0; 882 MPI_Initialized (&mpiHasBeenStarted); 883 if (! mpiHasBeenStarted) { 884 // Make pComm a new "serial communicator." 885 comm = rcp_implicit_cast<const Comm<int> > (rcp (new SerialComm<int> ())); 886 } 887 } 888 #endif // HAVE_MPI 889 return comm; 890 } 891 892 } // namespace (anonymous) 893 894 895 void computeGlobalTimerStatistics(stat_map_type & statData,std::vector<std::string> & statNames,Ptr<const Comm<int>> comm,const ECounterSetOp setOp,const std::string & filter)896 TimeMonitor::computeGlobalTimerStatistics (stat_map_type& statData, 897 std::vector<std::string>& statNames, 898 Ptr<const Comm<int> > comm, 899 const ECounterSetOp setOp, 900 const std::string& filter) 901 { 902 // Collect local timer data and names. Filter out timers with 903 // zero call counts if writeZeroTimers is false. Also, apply the 904 // timer label filter at this point, so we don't have to compute 905 // statistics on timers we don't want to display anyway. 906 timer_map_t localTimerData; 907 Array<std::string> localTimerNames; 908 const bool writeZeroTimers = false; 909 collectLocalTimerDataAndNames (localTimerData, localTimerNames, 910 counters(), writeZeroTimers, filter); 911 // Merge the local timer data and names into global timer data and 912 // names. 913 timer_map_t globalTimerData; 914 Array<std::string> globalTimerNames; 915 const bool alwaysWriteLocal = false; 916 collectGlobalTimerData (globalTimerData, globalTimerNames, 917 localTimerData, localTimerNames, 918 comm, alwaysWriteLocal, setOp); 919 // Compute statistics on the data. 920 computeGlobalTimerStats (statData, statNames, comm, globalTimerData, false); 921 } 922 923 924 void summarize(Ptr<const Comm<int>> comm,std::ostream & out,const bool alwaysWriteLocal,const bool writeGlobalStats,const bool writeZeroTimers,const ECounterSetOp setOp,const std::string & filter,const bool ignoreZeroTimers)925 TimeMonitor::summarize (Ptr<const Comm<int> > comm, 926 std::ostream& out, 927 const bool alwaysWriteLocal, 928 const bool writeGlobalStats, 929 const bool writeZeroTimers, 930 const ECounterSetOp setOp, 931 const std::string& filter, 932 const bool ignoreZeroTimers) 933 { 934 // 935 // We can't just call computeGlobalTimerStatistics(), since 936 // summarize() has different options that affect whether global 937 // statistics are computed and printed. 938 // 939 const int numProcs = comm->getSize(); 940 const int myRank = comm->getRank(); 941 942 // Collect local timer data and names. Filter out timers with 943 // zero call counts if writeZeroTimers is false. Also, apply the 944 // timer label filter at this point, so we don't have to compute 945 // statistics on timers we don't want to display anyway. 946 timer_map_t localTimerData; 947 Array<std::string> localTimerNames; 948 collectLocalTimerDataAndNames (localTimerData, localTimerNames, 949 counters(), writeZeroTimers, filter); 950 951 // If we're computing global statistics, merge the local timer 952 // data and names into global timer data and names, and compute 953 // global timer statistics. Otherwise, leave the global data 954 // empty. 955 timer_map_t globalTimerData; 956 Array<std::string> globalTimerNames; 957 stat_map_type statData; 958 std::vector<std::string> statNames; 959 if (writeGlobalStats) { 960 collectGlobalTimerData (globalTimerData, globalTimerNames, 961 localTimerData, localTimerNames, 962 comm, alwaysWriteLocal, setOp); 963 // Compute statistics on the data, but only if the communicator 964 // contains more than one process. Otherwise, statistics don't 965 // make sense and we don't print them (see below). 966 if (numProcs > 1) { 967 computeGlobalTimerStats (statData, statNames, comm, globalTimerData, ignoreZeroTimers); 968 } 969 } 970 971 // Precision of floating-point numbers in the table. 972 const int precision = format().precision(); 973 const std::ios_base::fmtflags& flags = out.flags(); 974 975 // All columns of the table, in order. 976 Array<TableColumn> tableColumns; 977 978 // Labels of all the columns of the table. 979 // We will append to this when we add each column. 980 Array<std::string> titles; 981 982 // Widths (in number of characters) of each column. 983 // We will append to this when we add each column. 984 Array<int> columnWidths; 985 986 // Table column containing all timer names. If writeGlobalStats 987 // is true, we use the global timer names, otherwise we use the 988 // local timer names. We build the table on all processes 989 // redundantly, but only print on Rank 0. 990 { 991 titles.append ("Timer Name"); 992 993 // The column labels depend on whether we are computing global statistics. 994 TableColumn nameCol (writeGlobalStats ? globalTimerNames : localTimerNames); 995 tableColumns.append (nameCol); 996 997 // Each column is as wide as it needs to be to hold both its 998 // title and all of the column data. This column's title is the 999 // current last entry of the titles array. 1000 columnWidths.append (format().computeRequiredColumnWidth (titles.back(), nameCol)); 1001 } 1002 1003 // Table column containing local timer stats, if applicable. We 1004 // only write local stats if asked, only on MPI Proc 0, and only 1005 // if there is more than one MPI process in the communicator 1006 // (otherwise local stats == global stats, so we just print the 1007 // global stats). In this case, we've padded the local data on 1008 // Proc 0 if necessary to match the global timer list, so that the 1009 // columns have the same number of rows. 1010 if (alwaysWriteLocal && numProcs > 1 && myRank == 0) { 1011 titles.append ("Local time (num calls)"); 1012 1013 // Copy local timer data out of the array-of-structs into 1014 // separate arrays, for display in the table. 1015 Array<double> localTimings; 1016 Array<double> localNumCalls; 1017 for (timer_map_t::const_iterator it = localTimerData.begin(); 1018 it != localTimerData.end(); ++it) { 1019 localTimings.push_back (it->second.first); 1020 localNumCalls.push_back (static_cast<double> (it->second.second)); 1021 } 1022 TableColumn timeAndCalls (localTimings, localNumCalls, precision, flags, true); 1023 tableColumns.append (timeAndCalls); 1024 columnWidths.append (format().computeRequiredColumnWidth (titles.back(), timeAndCalls)); 1025 } 1026 1027 if (writeGlobalStats) { 1028 // If there's only 1 process in the communicator, don't display 1029 // statistics; statistics don't make sense in that case. Just 1030 // display the timings and call counts. If there's more than 1 1031 // process, do display statistics. 1032 if (numProcs == 1) { 1033 // Extract timings and the call counts from globalTimerData. 1034 Array<double> globalTimings; 1035 Array<double> globalNumCalls; 1036 for (timer_map_t::const_iterator it = globalTimerData.begin(); 1037 it != globalTimerData.end(); ++it) { 1038 globalTimings.push_back (it->second.first); 1039 globalNumCalls.push_back (static_cast<double> (it->second.second)); 1040 } 1041 // Print the table column. 1042 titles.append ("Global time (num calls)"); 1043 TableColumn timeAndCalls (globalTimings, globalNumCalls, precision, flags, true); 1044 tableColumns.append (timeAndCalls); 1045 columnWidths.append (format().computeRequiredColumnWidth (titles.back(), timeAndCalls)); 1046 } 1047 else { // numProcs > 1 1048 // Print a table column for each statistic. statNames and 1049 // each value in statData use the same ordering, so we can 1050 // iterate over valid indices of statNames to display the 1051 // statistics in the right order. 1052 const timer_map_t::size_type numGlobalTimers = globalTimerData.size(); 1053 for (std::vector<std::string>::size_type statInd = 0; statInd < statNames.size(); ++statInd) { 1054 // Extract lists of timings and their call counts for the 1055 // current statistic. 1056 Array<double> statTimings (numGlobalTimers); 1057 Array<double> statCallCounts (numGlobalTimers); 1058 stat_map_type::const_iterator it = statData.begin(); 1059 for (int k = 0; it != statData.end(); ++it, ++k) { 1060 statTimings[k] = (it->second[statInd]).first; 1061 statCallCounts[k] = (it->second[statInd]).second; 1062 } 1063 // Print the table column. 1064 const std::string& statisticName = statNames[statInd]; 1065 const std::string titleString = statisticName; 1066 titles.append (titleString); 1067 TableColumn timeAndCalls (statTimings, statCallCounts, precision, flags, true); 1068 tableColumns.append (timeAndCalls); 1069 columnWidths.append (format().computeRequiredColumnWidth (titles.back(), timeAndCalls)); 1070 } 1071 } 1072 } 1073 1074 // Print the whole table to the given output stream on MPI Rank 0. 1075 format().setColumnWidths (columnWidths); 1076 if (myRank == 0) { 1077 std::ostringstream theTitle; 1078 theTitle << "TimeMonitor results over " << numProcs << " processor" 1079 << (numProcs > 1 ? "s" : ""); 1080 format().writeWholeTable (out, theTitle.str(), titles, tableColumns); 1081 } 1082 } 1083 1084 void summarize(std::ostream & out,const bool alwaysWriteLocal,const bool writeGlobalStats,const bool writeZeroTimers,const ECounterSetOp setOp,const std::string & filter,const bool ignoreZeroTimers)1085 TimeMonitor::summarize (std::ostream &out, 1086 const bool alwaysWriteLocal, 1087 const bool writeGlobalStats, 1088 const bool writeZeroTimers, 1089 const ECounterSetOp setOp, 1090 const std::string& filter, 1091 const bool ignoreZeroTimers) 1092 { 1093 // The default communicator. If Trilinos was built with MPI 1094 // enabled, this should be MPI_COMM_WORLD. Otherwise, this should 1095 // be a "serial" (no MPI, one "process") communicator. 1096 RCP<const Comm<int> > comm = getDefaultComm(); 1097 1098 summarize (comm.ptr(), out, alwaysWriteLocal, 1099 writeGlobalStats, writeZeroTimers, setOp, filter, ignoreZeroTimers); 1100 } 1101 1102 void computeGlobalTimerStatistics(stat_map_type & statData,std::vector<std::string> & statNames,const ECounterSetOp setOp,const std::string & filter)1103 TimeMonitor::computeGlobalTimerStatistics (stat_map_type& statData, 1104 std::vector<std::string>& statNames, 1105 const ECounterSetOp setOp, 1106 const std::string& filter) 1107 { 1108 // The default communicator. If Trilinos was built with MPI 1109 // enabled, this should be MPI_COMM_WORLD. Otherwise, this should 1110 // be a "serial" (no MPI, one "process") communicator. 1111 RCP<const Comm<int> > comm = getDefaultComm(); 1112 1113 computeGlobalTimerStatistics (statData, statNames, comm.ptr(), setOp, filter); 1114 } 1115 SyncTimeMonitor(Time & timer,Ptr<const Comm<int>> comm,bool reset)1116 SyncTimeMonitor::SyncTimeMonitor(Time& timer, Ptr<const Comm<int> > comm, bool reset) 1117 : TimeMonitor(timer, reset), comm_(comm) 1118 { } 1119 ~SyncTimeMonitor()1120 SyncTimeMonitor::~SyncTimeMonitor() { 1121 comm_->barrier(); 1122 } 1123 1124 1125 namespace { 1126 /// \brief Quote the given string for YAML output. 1127 /// 1128 /// TimeMonitor allows users to provide arbitrary strings as timer 1129 /// labels. It also allows developers to do the same for 1130 /// statistics labels. Since YAML has a particular syntax in 1131 /// which certain characters indicate structure, and since we 1132 /// include timer labels in YAML output, we have to modify timer 1133 /// labels slightly in order for them not to violate YAML 1134 /// requirements. 1135 /// 1136 /// We begin by quoting the string (if not already quoted) if it 1137 /// contains a colon. This is because YAML separates each key 1138 /// from its value in a key: value pair using a colon. That means 1139 /// that if the key itself contains a colon, we need to quote the 1140 /// whole key (which we do using double quotes). We use double 1141 /// quotes since YAML allows double-quoted strings to contain 1142 /// anything, vs. just printable characters. 1143 /// 1144 /// We also quote the string if any characters in it need 1145 /// escaping. For now, we escape double quotes (not counting 1146 /// those that quote whole string) and backslashes. For an 1147 /// example of quoting double quotes in a string, see Example 5.13 1148 /// in the YAML 1.2 spec. 1149 std::string quoteLabelForYaml(const std::string & label)1150 quoteLabelForYaml (const std::string& label) 1151 { 1152 // YAML allows empty keys in key: value pairs. See Section 7.2 1153 // of the YAML 1.2 spec. We thus let an empty label pass 1154 // through without quoting or other special treatment. 1155 if (label.empty ()) { 1156 return label; 1157 } 1158 1159 // Check whether the label is already quoted. If so, we don't 1160 // need to quote it again. However, we do need to quote any 1161 // quote symbols in the string inside the outer quotes. 1162 const bool alreadyQuoted = label.size () >= 2 && 1163 label[0] == '"' && label[label.size() - 1] == '"'; 1164 1165 // We need to quote if there are any colons or (inner) quotes in 1166 // the string. We'll determine this as we read through the 1167 // string and escape any characters that need escaping. 1168 bool needToQuote = false; 1169 1170 std::string out; // To fill with the return value 1171 out.reserve (label.size ()); 1172 1173 const size_t startPos = alreadyQuoted ? 1 : 0; 1174 const size_t endPos = alreadyQuoted ? label.size () - 1 : label.size (); 1175 for (size_t i = startPos; i < endPos; ++i) { 1176 const char c = label[i]; 1177 if (c == '"' || c == '\\') { 1178 out.push_back ('\\'); // Escape the quote or backslash. 1179 needToQuote = true; 1180 } 1181 else if (c == ':') { 1182 needToQuote = true; 1183 } 1184 out.push_back (c); 1185 } 1186 1187 if (needToQuote || alreadyQuoted) { 1188 // If the input string was already quoted, then out doesn't 1189 // include its quotes, so we have to add them back in. 1190 return "\"" + out + "\""; 1191 } 1192 else { 1193 return out; 1194 } 1195 } 1196 1197 } // namespace (anonymous) 1198 1199 1200 void TimeMonitor:: summarizeToYaml(Ptr<const Comm<int>> comm,std::ostream & out,const ETimeMonitorYamlFormat yamlStyle,const std::string & filter)1201 summarizeToYaml (Ptr<const Comm<int> > comm, 1202 std::ostream &out, 1203 const ETimeMonitorYamlFormat yamlStyle, 1204 const std::string& filter) 1205 { 1206 using Teuchos::FancyOStream; 1207 using Teuchos::fancyOStream; 1208 using Teuchos::getFancyOStream; 1209 using Teuchos::OSTab; 1210 using Teuchos::RCP; 1211 using Teuchos::rcpFromRef; 1212 using std::endl; 1213 typedef std::vector<std::string>::size_type size_type; 1214 1215 const bool compact = (yamlStyle == YAML_FORMAT_COMPACT); 1216 1217 // const bool writeGlobalStats = true; 1218 // const bool writeZeroTimers = true; 1219 // const bool alwaysWriteLocal = false; 1220 const ECounterSetOp setOp = Intersection; 1221 1222 stat_map_type statData; 1223 std::vector<std::string> statNames; 1224 computeGlobalTimerStatistics (statData, statNames, comm, setOp, filter); 1225 1226 const int numProcs = comm->getSize(); 1227 1228 // HACK (mfh 20 Aug 2012) For some reason, creating OSTab with "- 1229 // " as the line prefix does not work, else I would prefer that 1230 // method for printing each line of a YAML block sequence (see 1231 // Section 8.2.1 of the YAML 1.2 spec). 1232 // 1233 // Also, I have to set the tab indent string here, rather than in 1234 // OSTab's constructor. This is because line prefix (which for 1235 // some reason is what OSTab's constructor takes, rather than tab 1236 // indent string) means something different from tab indent 1237 // string, and turning on the line prefix prints all sorts of 1238 // things including "|" for some reason. 1239 RCP<FancyOStream> pfout = getFancyOStream (rcpFromRef (out)); 1240 pfout->setTabIndentStr (" "); 1241 FancyOStream& fout = *pfout; 1242 1243 fout << "# Teuchos::TimeMonitor report" << endl 1244 << "---" << endl; 1245 1246 // mfh 19 Aug 2012: An important goal of our chosen output format 1247 // was to minimize the nesting depth. We have managed to keep the 1248 // nesting depth to 3, which is the limit that the current version 1249 // of PylotDB imposes for its YAML input. 1250 1251 // Outermost level is a dictionary. (Individual entries of a 1252 // dictionary do _not_ begin with "- ".) We always print the 1253 // outermost level in standard style, not flow style, for better 1254 // readability. We begin the outermost level with metadata. 1255 fout << "Output mode: " << (compact ? "compact" : "spacious") << endl 1256 << "Number of processes: " << numProcs << endl 1257 << "Time unit: s" << endl; 1258 // For a key: value pair where the value is a sequence or 1259 // dictionary on the following line, YAML requires a space after 1260 // the colon. 1261 fout << "Statistics collected: "; 1262 // Print list of the names of all the statistics we collected. 1263 if (compact) { 1264 fout << " ["; 1265 for (size_type i = 0; i < statNames.size (); ++i) { 1266 fout << quoteLabelForYaml (statNames[i]); 1267 if (i + 1 < statNames.size ()) { 1268 fout << ", "; 1269 } 1270 } 1271 fout << "]" << endl; 1272 } 1273 else { 1274 fout << endl; 1275 OSTab tab1 (pfout); 1276 for (size_type i = 0; i < statNames.size (); ++i) { 1277 fout << "- " << quoteLabelForYaml (statNames[i]) << endl; 1278 } 1279 } 1280 1281 // Print the list of timer names. 1282 // 1283 // It might be nicer instead to print a map from timer name to all 1284 // of its data, but keeping the maximum nesting depth small 1285 // ensures better compatibility with different parsing tools. 1286 fout << "Timer names: "; 1287 if (compact) { 1288 fout << " ["; 1289 size_type ind = 0; 1290 for (stat_map_type::const_iterator it = statData.begin(); 1291 it != statData.end(); ++it, ++ind) { 1292 fout << quoteLabelForYaml (it->first); 1293 if (ind + 1 < statData.size ()) { 1294 fout << ", "; 1295 } 1296 } 1297 fout << "]" << endl; 1298 } 1299 else { 1300 fout << endl; 1301 OSTab tab1 (pfout); 1302 for (stat_map_type::const_iterator it = statData.begin(); 1303 it != statData.end(); ++it) { 1304 fout << "- " << quoteLabelForYaml (it->first) << endl; 1305 } 1306 } 1307 1308 // Print times for each timer, as a map from statistic name to its time. 1309 fout << "Total times: "; 1310 if (compact) { 1311 fout << " {"; 1312 size_type outerInd = 0; 1313 for (stat_map_type::const_iterator outerIter = statData.begin(); 1314 outerIter != statData.end(); ++outerIter, ++outerInd) { 1315 // Print timer name. 1316 fout << quoteLabelForYaml (outerIter->first) << ": "; 1317 // Print that timer's data. 1318 const std::vector<std::pair<double, double> >& curData = outerIter->second; 1319 fout << "{"; 1320 for (size_type innerInd = 0; innerInd < curData.size (); ++innerInd) { 1321 fout << quoteLabelForYaml (statNames[innerInd]) << ": " 1322 << curData[innerInd].first; 1323 if (innerInd + 1 < curData.size ()) { 1324 fout << ", "; 1325 } 1326 } 1327 fout << "}"; 1328 if (outerInd + 1 < statData.size ()) { 1329 fout << ", "; 1330 } 1331 } 1332 fout << "}" << endl; 1333 } 1334 else { 1335 fout << endl; 1336 OSTab tab1 (pfout); 1337 size_type outerInd = 0; 1338 for (stat_map_type::const_iterator outerIter = statData.begin(); 1339 outerIter != statData.end(); ++outerIter, ++outerInd) { 1340 // Print timer name. 1341 fout << quoteLabelForYaml (outerIter->first) << ": " << endl; 1342 // Print that timer's data. 1343 OSTab tab2 (pfout); 1344 const std::vector<std::pair<double, double> >& curData = outerIter->second; 1345 for (size_type innerInd = 0; innerInd < curData.size (); ++innerInd) { 1346 fout << quoteLabelForYaml (statNames[innerInd]) << ": " 1347 << curData[innerInd].first << endl; 1348 } 1349 } 1350 } 1351 1352 // Print call counts for each timer, for each statistic name. 1353 fout << "Call counts:"; 1354 if (compact) { 1355 fout << " {"; 1356 size_type outerInd = 0; 1357 for (stat_map_type::const_iterator outerIter = statData.begin(); 1358 outerIter != statData.end(); ++outerIter, ++outerInd) { 1359 // Print timer name. 1360 fout << quoteLabelForYaml (outerIter->first) << ": "; 1361 // Print that timer's data. 1362 const std::vector<std::pair<double, double> >& curData = outerIter->second; 1363 fout << "{"; 1364 for (size_type innerInd = 0; innerInd < curData.size (); ++innerInd) { 1365 fout << quoteLabelForYaml (statNames[innerInd]) << ": " 1366 << curData[innerInd].second; 1367 if (innerInd + 1 < curData.size ()) { 1368 fout << ", "; 1369 } 1370 } 1371 fout << "}"; 1372 if (outerInd + 1 < statData.size ()) { 1373 fout << ", "; 1374 } 1375 } 1376 fout << "}" << endl; 1377 } 1378 else { 1379 fout << endl; 1380 OSTab tab1 (pfout); 1381 size_type outerInd = 0; 1382 for (stat_map_type::const_iterator outerIter = statData.begin(); 1383 outerIter != statData.end(); ++outerIter, ++outerInd) { 1384 // Print timer name. 1385 fout << quoteLabelForYaml (outerIter->first) << ": " << endl; 1386 // Print that timer's data. 1387 OSTab tab2 (pfout); 1388 const std::vector<std::pair<double, double> >& curData = outerIter->second; 1389 for (size_type innerInd = 0; innerInd < curData.size (); ++innerInd) { 1390 fout << quoteLabelForYaml (statNames[innerInd]) << ": " 1391 << curData[innerInd].second << endl; 1392 } 1393 } 1394 } 1395 } 1396 1397 void TimeMonitor:: summarizeToYaml(std::ostream & out,const ETimeMonitorYamlFormat yamlStyle,const std::string & filter)1398 summarizeToYaml (std::ostream &out, 1399 const ETimeMonitorYamlFormat yamlStyle, 1400 const std::string& filter) 1401 { 1402 // The default communicator. If Trilinos was built with MPI 1403 // enabled, this should be MPI_COMM_WORLD. Otherwise, this should 1404 // be a "serial" (no MPI, one "process") communicator. 1405 RCP<const Comm<int> > comm = getDefaultComm (); 1406 1407 summarizeToYaml (comm.ptr (), out, yamlStyle, filter); 1408 } 1409 1410 // Default value is false. We'll set to true once 1411 // setReportParameters() completes successfully. 1412 bool TimeMonitor::setParams_ = false; 1413 1414 // We have to declare all of these here in order to avoid linker errors. 1415 TimeMonitor::ETimeMonitorReportFormat TimeMonitor::reportFormat_ = TimeMonitor::REPORT_FORMAT_TABLE; 1416 TimeMonitor::ETimeMonitorYamlFormat TimeMonitor::yamlStyle_ = TimeMonitor::YAML_FORMAT_SPACIOUS; 1417 ECounterSetOp TimeMonitor::setOp_ = Intersection; 1418 bool TimeMonitor::alwaysWriteLocal_ = false; 1419 bool TimeMonitor::writeGlobalStats_ = true; 1420 bool TimeMonitor::writeZeroTimers_ = true; 1421 1422 void setReportFormatParameter(ParameterList & plist)1423 TimeMonitor::setReportFormatParameter (ParameterList& plist) 1424 { 1425 const std::string name ("Report format"); 1426 const std::string defaultValue ("Table"); 1427 const std::string docString ("Output format for report of timer statistics"); 1428 Array<std::string> strings; 1429 Array<std::string> docs; 1430 Array<ETimeMonitorReportFormat> values; 1431 1432 strings.push_back ("YAML"); 1433 docs.push_back ("YAML (see yaml.org) format"); 1434 values.push_back (REPORT_FORMAT_YAML); 1435 strings.push_back ("Table"); 1436 docs.push_back ("Tabular format via Teuchos::TableFormat"); 1437 values.push_back (REPORT_FORMAT_TABLE); 1438 1439 setStringToIntegralParameter<ETimeMonitorReportFormat> (name, defaultValue, 1440 docString, 1441 strings (), docs (), 1442 values (), &plist); 1443 } 1444 1445 void setYamlFormatParameter(ParameterList & plist)1446 TimeMonitor::setYamlFormatParameter (ParameterList& plist) 1447 { 1448 const std::string name ("YAML style"); 1449 const std::string defaultValue ("spacious"); 1450 const std::string docString ("YAML-specific output format"); 1451 Array<std::string> strings; 1452 Array<std::string> docs; 1453 Array<ETimeMonitorYamlFormat> values; 1454 1455 strings.push_back ("compact"); 1456 docs.push_back ("Compact format: use \"flow style\" (see YAML 1.2 spec at " 1457 "yaml.org) for most sequences except the outermost sequence"); 1458 values.push_back (YAML_FORMAT_COMPACT); 1459 1460 strings.push_back ("spacious"); 1461 docs.push_back ("Spacious format: avoid flow style"); 1462 values.push_back (YAML_FORMAT_SPACIOUS); 1463 1464 setStringToIntegralParameter<ETimeMonitorYamlFormat> (name, defaultValue, 1465 docString, 1466 strings (), docs (), 1467 values (), &plist); 1468 } 1469 1470 void setSetOpParameter(ParameterList & plist)1471 TimeMonitor::setSetOpParameter (ParameterList& plist) 1472 { 1473 const std::string name ("How to merge timer sets"); 1474 const std::string defaultValue ("Intersection"); 1475 const std::string docString ("How to merge differing sets of timers " 1476 "across processes"); 1477 Array<std::string> strings; 1478 Array<std::string> docs; 1479 Array<ECounterSetOp> values; 1480 1481 strings.push_back ("Intersection"); 1482 docs.push_back ("Compute intersection of timer sets over processes"); 1483 values.push_back (Intersection); 1484 strings.push_back ("Union"); 1485 docs.push_back ("Compute union of timer sets over processes"); 1486 values.push_back (Union); 1487 1488 setStringToIntegralParameter<ECounterSetOp> (name, defaultValue, docString, 1489 strings (), docs (), values (), 1490 &plist); 1491 } 1492 1493 void setStackedTimer(const Teuchos::RCP<Teuchos::StackedTimer> & t)1494 TimeMonitor::setStackedTimer(const Teuchos::RCP<Teuchos::StackedTimer>& t) 1495 { 1496 stackedTimer_ = t; 1497 } 1498 1499 Teuchos::RCP<Teuchos::StackedTimer> getStackedTimer()1500 TimeMonitor::getStackedTimer() 1501 { 1502 return stackedTimer_; 1503 } 1504 1505 RCP<const ParameterList> getValidReportParameters()1506 TimeMonitor::getValidReportParameters () 1507 { 1508 // Our implementation favors recomputation over persistent 1509 // storage. That is, we simply recreate the list every time we 1510 // need it. 1511 RCP<ParameterList> plist = parameterList ("TimeMonitor::report"); 1512 1513 const bool alwaysWriteLocal = false; 1514 const bool writeGlobalStats = true; 1515 const bool writeZeroTimers = true; 1516 1517 setReportFormatParameter (*plist); 1518 setYamlFormatParameter (*plist); 1519 setSetOpParameter (*plist); 1520 plist->set ("alwaysWriteLocal", alwaysWriteLocal, 1521 "Always output local timers' values on Proc 0"); 1522 plist->set ("writeGlobalStats", writeGlobalStats, "Always output global " 1523 "statistics, even if there is only one process in the " 1524 "communicator"); 1525 plist->set ("writeZeroTimers", writeZeroTimers, "Generate output for " 1526 "timers that have never been called"); 1527 1528 return rcp_const_cast<const ParameterList> (plist); 1529 } 1530 1531 void setReportParameters(const RCP<ParameterList> & params)1532 TimeMonitor::setReportParameters (const RCP<ParameterList>& params) 1533 { 1534 ETimeMonitorReportFormat reportFormat = REPORT_FORMAT_TABLE; 1535 ETimeMonitorYamlFormat yamlStyle = YAML_FORMAT_SPACIOUS; 1536 ECounterSetOp setOp = Intersection; 1537 bool alwaysWriteLocal = false; 1538 bool writeGlobalStats = true; 1539 bool writeZeroTimers = true; 1540 1541 if (params.is_null ()) { 1542 // If we've set parameters before, leave their current values. 1543 // Otherwise, set defaults (below). 1544 if (setParams_) { 1545 return; 1546 } 1547 } 1548 else { // params is nonnull. Let's read it! 1549 params->validateParametersAndSetDefaults (*getValidReportParameters ()); 1550 1551 reportFormat = getIntegralValue<ETimeMonitorReportFormat> (*params, "Report format"); 1552 yamlStyle = getIntegralValue<ETimeMonitorYamlFormat> (*params, "YAML style"); 1553 setOp = getIntegralValue<ECounterSetOp> (*params, "How to merge timer sets"); 1554 alwaysWriteLocal = params->get<bool> ("alwaysWriteLocal"); 1555 writeGlobalStats = params->get<bool> ("writeGlobalStats"); 1556 writeZeroTimers = params->get<bool> ("writeZeroTimers"); 1557 } 1558 // Defer setting state until here, to ensure the strong exception 1559 // guarantee for this method (either it throws with no externally 1560 // visible state changes, or it returns normally). 1561 reportFormat_ = reportFormat; 1562 yamlStyle_ = yamlStyle; 1563 setOp_ = setOp; 1564 alwaysWriteLocal_ = alwaysWriteLocal; 1565 writeGlobalStats_ = writeGlobalStats; 1566 writeZeroTimers_ = writeZeroTimers; 1567 1568 setParams_ = true; // Yay, we successfully set parameters! 1569 } 1570 1571 void report(Ptr<const Comm<int>> comm,std::ostream & out,const std::string & filter,const RCP<ParameterList> & params)1572 TimeMonitor::report (Ptr<const Comm<int> > comm, 1573 std::ostream& out, 1574 const std::string& filter, 1575 const RCP<ParameterList>& params) 1576 { 1577 setReportParameters (params); 1578 1579 if (reportFormat_ == REPORT_FORMAT_YAML) { 1580 summarizeToYaml (comm, out, yamlStyle_, filter); 1581 } 1582 else if (reportFormat_ == REPORT_FORMAT_TABLE) { 1583 summarize (comm, out, alwaysWriteLocal_, writeGlobalStats_, 1584 writeZeroTimers_, setOp_, filter); 1585 } 1586 else { 1587 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "TimeMonitor::report: " 1588 "Invalid report format. This should never happen; ParameterList " 1589 "validation should have caught this. Please report this bug to the " 1590 "Teuchos developers."); 1591 } 1592 } 1593 1594 void report(Ptr<const Comm<int>> comm,std::ostream & out,const RCP<ParameterList> & params)1595 TimeMonitor::report (Ptr<const Comm<int> > comm, 1596 std::ostream& out, 1597 const RCP<ParameterList>& params) 1598 { 1599 report (comm, out, "", params); 1600 } 1601 1602 void report(std::ostream & out,const std::string & filter,const RCP<ParameterList> & params)1603 TimeMonitor::report (std::ostream& out, 1604 const std::string& filter, 1605 const RCP<ParameterList>& params) 1606 { 1607 RCP<const Comm<int> > comm = getDefaultComm (); 1608 report (comm.ptr (), out, filter, params); 1609 } 1610 1611 void report(std::ostream & out,const RCP<ParameterList> & params)1612 TimeMonitor::report (std::ostream& out, 1613 const RCP<ParameterList>& params) 1614 { 1615 RCP<const Comm<int> > comm = getDefaultComm (); 1616 report (comm.ptr (), out, "", params); 1617 } 1618 1619 } // namespace Teuchos 1620