// spirit/workbench/measure.hpp

// Copyright David Abrahams, Matthias Troyer, Michael Gauckler
// 2005. Distributed under the Boost Software License, Version
// 1.0. (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#if !defined(BOOST_SPIRIT_TEST_BENCHMARK_HPP)
#define BOOST_SPIRIT_TEST_BENCHMARK_HPP

// Compiler settings applied only when building with MSVC (_MSC_VER defined).
// They appear before every #include in this header, so the headers pulled in
// below are compiled with them in effect.
#ifdef _MSC_VER
// inline aggressively
# pragma inline_recursion(on) // turn on inline recursion
# pragma inline_depth(255)    // max inline depth
// NOTE(review): _SECURE_SCL set to 0 is presumably meant to switch off MSVC's
// checked-iterator overhead for the benchmark -- confirm against the MSVC docs.
# define _SECURE_SCL 0
#endif

#include "high_resolution_timer.hpp"
#include <iostream>
#include <cstring>
#include <boost/preprocessor/seq/for_each.hpp>
#include <boost/preprocessor/stringize.hpp>

namespace test
{
    // Sink for benchmark results.  hammer() stores the sum of all
    // accumulator values here; the executable is expected to derive its
    // return code from it, so an optimizing compiler cannot treat the
    // benchmarked work as dead code and remove it.
    int live_code;

    // Run Accumulator::benchmark() `repeats` times on every element of a
    // local array of default-constructed accumulators, then store the sum
    // of their `val` members in live_code.
    //
    // Accumulator must be default-constructible and provide a benchmark()
    // member function plus a `val` member that can be added to an int.
    template <class Accumulator>
    void hammer(long const repeats)
    {
        // Why an array instead of a single accumulator:
        //
        //  * Each call adds to the value left by the previous call on the
        //    same object.  With one object, every addition would have to
        //    wait for the one before it and could stall the CPU pipeline.
        //    Cycling through many objects keeps neighbouring operations
        //    independent of each other.
        //
        //  * A single accumulator might be kept in a register and never
        //    written back to memory.  Here every operation should at
        //    least update the L1 cache.  (This concern is specific to the
        //    application the test is aimed at.)
        //
        // The array must hold at least as many accumulators as there can
        // be accumulations in flight in the pipeline at once; anything
        // larger than the machine's maximum pipeline depth is safe.  1024
        // is an upper limit on the pipeline depth of current vector
        // machines.  Increase the count to exercise the L2 or L3 cache,
        // or main memory, instead.
        std::size_t const accumulator_count = 1024;

        live_code = 0; // start every run from a clean sum

        Accumulator pool[accumulator_count];
        Accumulator* const pool_end = pool + accumulator_count;

        for (long pass = 0; pass < repeats; ++pass)
        {
            for (Accumulator* current = pool; current != pool_end; ++current)
                current->benchmark();
        }

        // Fold every partial result into the global sink so that none of
        // the work above can be eliminated as dead code.
        for (std::size_t index = 0; index != accumulator_count; ++index)
            live_code += pool[index].val;
    }

    // Measure the time required to hammer accumulators of the given type
    template <class Accumulator>
    double measure(long const repeats)
    {
        // Hammer accumulators a couple of times to ensure the
        // instruction cache is full of our test code, and that we don't
        // measure the cost of a page fault for accessing the data page
        // containing the memory where the accumulators will be
        // allocated
        hammer<Accumulator>(repeats);
        hammer<Accumulator>(repeats);

        // Now start a timer
        util::high_resolution_timer time;
        hammer<Accumulator>(repeats);   // This time, we'll measure
        return time.elapsed();          // return the elapsed time
    }

    // Measure the given Accumulator type and print one result line to
    // std::cout in the form
    //
    //     <name>: <padding><time> [s] {checksum: <hex>}
    //
    // name    - NUL-terminated label printed at the start of the line.
    //           Labels shorter than 20 characters are followed by spaces
    //           so that the timings line up; longer labels get no padding.
    // repeats - repeat count forwarded to test::measure.
    //
    // The time is printed in fixed notation with precision 10.  The
    // checksum is the `val` member of a fresh Accumulator after exactly
    // one benchmark() call, printed in hexadecimal.
    //
    // The formatting state of std::cout (flags and precision) is the same
    // after the call as before it.
    template <class Accumulator>
    void report(char const* name, long const repeats)
    {
        // std::fixed, std::hex and the precision are sticky stream
        // settings.  Remember the caller's values so they can be put back
        // once the line has been written.
        std::ios_base::fmtflags const saved_flags = std::cout.flags();
        std::streamsize const saved_precision = std::cout.precision(10);

        std::cout << name << ": ";
        int const padding = 20 - static_cast<int>(std::strlen(name));
        for (int i = 0; i < padding; ++i)
            std::cout << ' ';
        std::cout << std::fixed << test::measure<Accumulator>(repeats) << " [s] ";

        Accumulator acc;
        acc.benchmark();
        std::cout << std::hex << "{checksum: " << acc.val << "}";
        std::cout << std::endl;     // std::endl also flushes the stream

        std::cout.flags(saved_flags);
        std::cout.precision(saved_precision);
    }

    // Holder of the result value that hammer() sums up and report()
    // prints.  Presumably meant as a base class for the accumulator types
    // being benchmarked (none of them are defined in this header).
    struct base
    {
        // Result of the benchmarked work; reading it afterwards is what
        // keeps the work from being removed as dead code.
        int val;

        // Start from zero.
        base() : val() {}
    };

// Per-element callback for BOOST_PP_SEQ_FOR_EACH (see
// BOOST_SPIRIT_TEST_BENCHMARK): expands to one untimed
// test::hammer<elem>(repeats); statement.  The r and data arguments are
// ignored.  A variable named `repeats` must be in scope where the macro
// is expanded.
#define BOOST_SPIRIT_TEST_HAMMER(r, data, elem)                     \
    test::hammer<elem>(repeats);
    /***/

// Per-element callback for BOOST_PP_SEQ_FOR_EACH (see
// BOOST_SPIRIT_TEST_BENCHMARK): expands to one test::report<elem>(...)
// statement that uses the stringized element as the printed label.  The
// r and data arguments are ignored.  A variable named `repeats` must be
// in scope where the macro is expanded.
#define BOOST_SPIRIT_TEST_MEASURE(r, data, elem)                    \
    test::report<elem>(BOOST_PP_STRINGIZE(elem), repeats);          \
    /***/

// Benchmark driver.  Expands to a sequence of statements, so it must be
// used inside a function body, and only once per scope because it
// declares the local variables `repeats` and `measured`.
//
// max_repeats - bound used to stop the calibration loop; it may be any
//               expression (it is parenthesized where it is used).
// FSeq        - Boost.Preprocessor sequence of accumulator types, e.g.
//               (type_a)(type_b).
//
// Calibration: starting from 100, `repeats` is multiplied by 10 and all
// types in FSeq are hammered once, until one such round takes at least
// 2.0 (seconds, going by the "[s]" label printed by test::report) or
// `repeats` has grown beyond max_repeats.  The bound is tested before the
// multiplication, so the final count can be as large as 10 * max_repeats.
//
// Afterwards every type in FSeq is measured and reported with the final
// repeat count.
#define BOOST_SPIRIT_TEST_BENCHMARK(max_repeats, FSeq)              \
    long repeats = 100;                                             \
    double measured = 0;                                            \
    while (measured < 2.0 && repeats <= (max_repeats))              \
    {                                                               \
        repeats *= 10;                                              \
        util::high_resolution_timer time;                           \
        BOOST_PP_SEQ_FOR_EACH(BOOST_SPIRIT_TEST_HAMMER, _, FSeq)    \
        measured = time.elapsed();                                  \
    }                                                               \
    BOOST_PP_SEQ_FOR_EACH(BOOST_SPIRIT_TEST_MEASURE, _, FSeq)       \
    /***/
}

#endif