// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3.  If not see
// <http://www.gnu.org/licenses/>.

// Override the -std flag in the check_performance script: STD=gnu++17

// Run the test as both single- and multi-threaded: TEST_B

#include <memory_resource>
#include <list>
#include <string>
#include <testsuite_performance.h>

const int iterations = 100;

// Insert and remove elements of various sizes in std::list containers.
// If timers != nullptr the function will pause the timer while the lists
// are cleared and deallocated, so that only insertions/removals are timed.
// Otherwise, the time taken to deallocate the lists is also counted.
void
populate_lists(std::pmr::memory_resource* r, __gnu_test::time_counter* timers,
               int kmax = iterations)
{
  struct size16 { char c[16]; };
  struct size32 { char c[32]; };
  struct size64 { char c[64]; };
  struct size128 { char c[128]; };

  std::pmr::list<int> l4(r);
  std::pmr::list<size16> l16(r);
  std::pmr::list<size32> l32(r);
  std::pmr::list<size64> l64(r);
  std::pmr::list<size128> l128(r);

  const int imax = 1000;
  const int jmax = 100;
  for (int k = 0; k < kmax; ++k)
  {
    for (int i = 0; i < imax; ++i)
    {
      for (int j = 0; j < jmax; ++j)
      {
        l4.emplace_back();
        l16.emplace_back();
        l32.emplace_back();
        l64.emplace_back();
        l128.emplace_back();
      }
      l4.pop_front();
      l16.pop_front();
      l32.pop_front();
      l64.pop_front();
      l128.pop_front();
    }

    if (timers)
      timers->stop();

    // Deallocate everything:
    l4.clear();
    l16.clear();
    l32.clear();
    l64.clear();
    l128.clear();

    if (timers)
      timers->restart();
  }
}

// Test allocations and deallocations of node-based containers (std::list).
// In this test pmr::unsynchronized_pool_resource should be faster than
// pmr::new_delete_resource().
void test_lists_single_thread()
{
  std::pmr::memory_resource* newdel = std::pmr::new_delete_resource();
  std::pmr::unsynchronized_pool_resource pool;
#ifndef NOTHREAD
  std::pmr::synchronized_pool_resource syncpool;
#endif

  auto run_test = [](auto* memres, std::string name, bool time_dtors) {
    name += " std::list push/pop";
    if (time_dtors)
      name += "/destroy";
    __gnu_test::time_counter time;
    __gnu_test::resource_counter resource;
    start_counters(time, resource);
    populate_lists(memres, time_dtors ? nullptr : &time);
    stop_counters(time, resource);
    report_performance(__FILE__, name, time, resource);
  };

  for (auto time_dtors : {false, true})
  {
    run_test(newdel, "new-delete-1 ", time_dtors);
    run_test(newdel, "new-delete-2 ", time_dtors);
    run_test(newdel, "new-delete-3 ", time_dtors);

    // Start with an empty set of pools:
    pool.release();
    run_test(&pool, "unsync-pool-1", time_dtors);
    // Destroy pools and start fresh:
    pool.release();
    run_test(&pool, "unsync-pool-2", time_dtors);
    // Do not destroy pools, reuse allocated memory:
    run_test(&pool, "unsync-pool-3", time_dtors);

#ifndef NOTHREAD
    syncpool.release();
    run_test(&syncpool, "sync-pool-1  ", time_dtors);
    // Destroy pools and start fresh:
    syncpool.release();
    run_test(&syncpool, "sync-pool-2  ", time_dtors);
    // Do not destroy pools, reuse allocated memory:
    run_test(&syncpool, "sync-pool-3  ", time_dtors);
#endif
  }
}

// TODO test non-pooled large allocations from (un)synchronized_pool_resource;
// a sketch of what that might look like follows.

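// A minimal sketch of what that TODO might look like (illustrative only,
// not called from main()). Requests larger than
// pool_options::largest_required_pool_block bypass the pools and go
// straight to the upstream resource; the 256-byte threshold and the
// 4096-byte request size below are assumptions, not values used elsewhere
// in this test.
void sketch_large_alloc_test()
{
  std::pmr::pool_options opts;
  opts.largest_required_pool_block = 256; // anything bigger is non-pooled
  std::pmr::unsynchronized_pool_resource pool(opts);

  __gnu_test::time_counter time;
  __gnu_test::resource_counter resource;
  start_counters(time, resource);
  for (int i = 0; i < 1000 * iterations; ++i)
  {
    // 4096 > largest_required_pool_block, so these requests exercise the
    // non-pooled path through the upstream resource:
    void* p = pool.allocate(4096);
    pool.deallocate(p, 4096);
  }
  stop_counters(time, resource);
  report_performance(__FILE__, "unsync-pool large allocs", time, resource);
}
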
#ifndef NOTHREAD
# include <thread>
# include <mutex>
# include <vector>
# include <cassert>

// Multithreaded std::list test with each thread having its own resource.
// (pmr::new_delete vs pmr::unsynchronized_pool vs pmr::synchronized_pool)
//
// In this test both pmr::unsynchronized_pool_resource and
// pmr::synchronized_pool_resource should be faster than
// pmr::new_delete_resource().
void test_lists_resource_per_thread()
{
  std::mutex mx;
  std::unique_lock<std::mutex> gate(mx, std::defer_lock);
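  // The main thread locks 'gate' before spawning the worker threads; each
  // worker briefly locks (and immediately releases) 'mx' before doing any
  // real work, so no thread starts populating lists until the timers are
  // running and the gate has been unlocked.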

  struct state
  {
    std::thread thread;

    // Per-thread pool resources:
    std::pmr::unsynchronized_pool_resource unsync;
    std::pmr::synchronized_pool_resource sync;

    std::pmr::memory_resource* memres[3] = {
      std::pmr::new_delete_resource(), &unsync, &sync
    };
  };

  state states[4];

  const std::string resnames[] = {"new-delete ", "unsync-pool", "sync-pool  "};

  auto run_test = [&mx] (std::pmr::memory_resource* memres,
                         __gnu_test::time_counter* timers)
  {
    std::lock_guard<std::mutex>{mx};  // block until the mutex can be locked
    populate_lists(memres, timers);
  };

  auto time_threads = [&] (std::string testname, bool time_dtors, int which) {
    __gnu_test::time_counter time;
    __gnu_test::resource_counter resource;
    gate.lock();
    auto* time_ptr = time_dtors ? nullptr : &time;
    for (auto& s : states)
      s.thread = std::thread{ run_test, s.memres[which], time_ptr };
    start_counters(time, resource);
    gate.unlock(); // let the threads run
    for (auto& s : states)
      s.thread.join();
    stop_counters(time, resource);
    report_performance(__FILE__, resnames[which] + testname, time, resource);
  };

  for (auto time_dtors : {false, true})
  {
    std::string testname = " resource-per-thread std::list push/pop";
    if (time_dtors)
      testname += "/destroy";
    for (int which : {0, 1, 2})
      time_threads(testname, time_dtors, which);
  }
}

// A naive memory_resource that adds a mutex to unsynchronized_pool_resource
struct locking_pool_resource : std::pmr::unsynchronized_pool_resource
{
  void* do_allocate(std::size_t b, std::size_t a) override
  {
    std::lock_guard<std::mutex> l(m);
    return unsynchronized_pool_resource::do_allocate(b, a);
  }

  void do_deallocate(void* p, std::size_t b, std::size_t a) override
  {
    std::lock_guard<std::mutex> l(m);
    return unsynchronized_pool_resource::do_deallocate(p, b, a);
  }

  std::mutex m;
};
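// (The single mutex serializes every allocation and deallocation across
// all threads, whereas pmr::synchronized_pool_resource keeps per-thread
// pools and only takes an exclusive lock on its slower cross-thread paths.)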

// Multithreaded std::list test with all threads sharing the same resource.
// (new_delete vs unsynchronized_pool+mutex vs synchronized_pool)
//
// pmr::synchronized_pool_resource is not expected to be anywhere near
// as fast as pmr::new_delete_resource() here, but should perform much
// better than the naive locking_pool_resource type.
void test_lists_shared_resource()
{
  std::mutex mx;
  std::unique_lock<std::mutex> gate(mx, std::defer_lock);

  locking_pool_resource unsync;
  std::pmr::synchronized_pool_resource sync;

  std::pmr::memory_resource* memres[3] = {
    std::pmr::new_delete_resource(), &unsync, &sync
  };

  std::thread threads[4];

  const std::string resnames[3] = { "new-delete", "mutex-pool", "sync-pool " };

  auto run_test = [&mx] (std::pmr::memory_resource* memres,
                         __gnu_test::time_counter* timers)
  {
    std::lock_guard<std::mutex>{mx};  // block until the mutex can be locked
    populate_lists(memres, timers);
  };

  auto time_threads = [&] (std::string testname, bool time_dtors, int which) {
    __gnu_test::time_counter time;
    __gnu_test::resource_counter resource;
    gate.lock();
    auto* time_ptr = time_dtors ? nullptr : &time;
    for (auto& t : threads)
      t = std::thread{ run_test, memres[which], time_ptr };
    start_counters(time, resource);
    gate.unlock(); // let the threads run
    for (auto& t : threads)
      t.join();
    stop_counters(time, resource);
    report_performance(__FILE__, resnames[which] + testname, time, resource);
  };

  for (auto time_dtors : {false, true})
  {
    std::string testname = " shared-resource std::list push/pop";
    if (time_dtors)
      testname += "/destroy";
    for (int which : {0, 1, 2})
      time_threads(testname, time_dtors, which);
  }
}

// TODO threaded test just doing loads of allocations, no deallocs,
// both with per-thread resource (unsync vs sync vs newdel)
// and shared resource (locked vs sync vs newdel);
// see the sketch after these TODOs.

// TODO threaded test just doing loads of deallocations, no allocs,
// both with per-thread resource (unsync vs sync vs newdel)
// and shared resource (locked vs sync vs newdel)

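// A minimal sketch of the alloc-only, resource-per-thread case from the
// first TODO above (illustrative only, not called from main()). Each
// thread hammers allocate() on its own pool; the 32-byte block size and
// the per-thread allocation count are assumptions. Deallocation happens
// outside the timed region via release().
void sketch_alloc_only_per_thread()
{
  const int num_threads = 4;
  std::pmr::unsynchronized_pool_resource pools[num_threads];
  std::vector<void*> ptrs[num_threads];
  for (auto& v : ptrs)
    v.resize(1000 * iterations);

  __gnu_test::time_counter time;
  __gnu_test::resource_counter resource;
  start_counters(time, resource);
  std::thread threads[num_threads];
  for (int i = 0; i < num_threads; ++i)
    threads[i] = std::thread{ [&, i] {
      for (auto& p : ptrs[i])
        p = pools[i].allocate(32, 8); // allocations only, no deallocs
    } };
  for (auto& t : threads)
    t.join();
  stop_counters(time, resource);
  report_performance(__FILE__, "unsync-pool alloc-only", time, resource);

  // Untimed clean-up: release() returns all memory to upstream at once.
  for (auto& pool : pools)
    pool.release();
}
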
// Multithreaded test where deallocations happen on different threads.
// (new_delete vs unsynchronized_pool+mutex vs synchronized_pool)
//
// This hits the slow path for pmr::synchronized_pool_resource, where
// an exclusive lock must be taken to access other threads' pools.
// pmr::synchronized_pool_resource is not expected to be anywhere near
// as fast as pmr::new_delete_resource() here, but should perform much
// better than the naive locking_pool_resource type.
void test_cross_thread_dealloc()
{
  const int num_threads = 4;

  struct X {
    void* ptr;
    unsigned size;
  };

  // A buffer for each thread, and extra buffers for half of the threads:
  std::vector<X> allocs[num_threads * 3 / 2];
  for (auto& v : allocs)
    v.resize(1000 * iterations);

  // Use a few different sizes, so that several pools are exercised:
  const std::size_t sizes[] = { 8, 16, 8, 16, 32, 64, 8, 16, 32, 64 };

  std::mutex mx;

  auto run_test =
  [&, num_threads] (std::pmr::memory_resource* memres, int i, bool with_exit)
  {
    std::size_t counter = 0;
    std::lock_guard<std::mutex>{mx};  // block until the mutex can be locked
    // Fill this thread's buffer with allocations:
    for (X& x : allocs[i])
    {
      x.size = sizes[counter++ % 10];
      x.ptr = memres->allocate(x.size, 1);
    }

    if (with_exit && i == 0)
    {
      // One of the threads exits, so that its pools transfer to the
      // non-thread-specific list of pools.
      return;
    }
    else if (i < num_threads / 2)
    {
      // Other threads continue allocating, into the extra buffers:
      for (X& x : allocs[num_threads + i])
      {
        x.size = sizes[counter++ % 10];
        x.ptr = memres->allocate(x.size, 1);
      }
    }
    else
    {
      // Half of the threads start deallocating their own memory and the
      // memory allocated by another thread:
      const int other = i - num_threads / 2;
      for (unsigned n = 0; n < allocs[i].size(); ++n)
      {
        // Deallocate memory allocated in this thread:
        X& x1 = allocs[i][n];
        memres->deallocate(x1.ptr, x1.size, 1);
        x1 = {};
        // Deallocate memory allocated in another thread:
        X& x2 = allocs[other][n];
        memres->deallocate(x2.ptr, x2.size, 1);
        x2 = {};
      }
    }
  };

  std::thread threads[num_threads];

  locking_pool_resource unsync;
  std::pmr::synchronized_pool_resource sync;

  std::pmr::memory_resource* memres[3] = {
    std::pmr::new_delete_resource(), &unsync, &sync
  };
  const std::string resnames[3] = { "new-delete", "mutex-pool", "sync-pool " };

  auto time_threads = [&] (std::string name, int which, bool with_exit)
  {
    __gnu_test::time_counter time;
    __gnu_test::resource_counter resource;
    std::unique_lock<std::mutex> gate(mx);
    for (auto& t : threads)
      t = std::thread{ run_test, memres[which], &t - threads, with_exit };
    start_counters(time, resource);
    gate.unlock(); // let the threads run
    for (auto& t : threads)
      t.join();
    stop_counters(time, resource);
    report_performance(__FILE__, resnames[which] + name, time, resource);

    // Clean up:
    for (auto& a : allocs)
    {
      const int i = (&a - allocs);
      if (i < num_threads) // These allocations were freed by the threads
        for (auto& x : a)
        {
          assert(x.ptr == nullptr);
        }
      else if (with_exit && i == num_threads)
        ; // Thread 0 exited early, so its extra buffer was never filled
      else
        for (auto& x : a)
        {
          memres[which]->deallocate(x.ptr, x.size, 1);
          x = {};
        }
    }
  };

  for (int which : {0, 1, 2})
    time_threads(" cross-thread dealloc", which, false);
  for (int which : {0, 1, 2})
    time_threads(" cross-thread dealloc w/exit", which, true);
}
#endif

int main()
{
  test_lists_single_thread();
#ifndef NOTHREAD
  test_lists_resource_per_thread();
  test_lists_shared_resource();
  test_cross_thread_dealloc();
#endif
}