1 // Pattern-defeating quicksort
2 
3 //              Copyright Orson Peters 2017.
4 // Distributed under the Boost Software License, Version 1.0.
5 //    (See accompanying file LICENSE_1_0.txt or copy at
6 //          http://www.boost.org/LICENSE_1_0.txt)
7 
8 // See http://www.boost.org/libs/sort/ for library home page.
9 
10 
11 #ifndef BOOST_SORT_PDQSORT_HPP
12 #define BOOST_SORT_PDQSORT_HPP
13 
14 #include <algorithm>
15 #include <cstddef>
16 #include <functional>
17 #include <iterator>
18 #include <utility>
19 #include <boost/type_traits.hpp>
20 
21 #if __cplusplus >= 201103L
22     #include <cstdint>
23     #define BOOST_PDQSORT_PREFER_MOVE(x) std::move(x)
24 #else
25     #define BOOST_PDQSORT_PREFER_MOVE(x) (x)
26 #endif
27 
28 namespace boost {
29 namespace sort {
30 
31 namespace pdqsort_detail {
32     enum {
33         // Partitions below this size are sorted using insertion sort.
34         insertion_sort_threshold = 24,
35 
36         // Partitions above this size use Tukey's ninther to select the pivot.
37         ninther_threshold = 128,
38 
39         // When we detect an already sorted partition, attempt an insertion sort that allows this
40         // amount of element moves before giving up.
41         partial_insertion_sort_limit = 8,
42 
43         // Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char.
44         block_size = 64,
45 
46         // Cacheline size, assumes power of two.
47         cacheline_size = 64
48     };
49 
50     template<class T> struct is_default_compare : boost::false_type { };
51     template<class T> struct is_default_compare<std::less<T> > : boost::true_type { };
52     template<class T> struct is_default_compare<std::greater<T> > : boost::true_type { };
53 
54     // Returns floor(log2(n)), assumes n > 0.
55     template<class T>
log2(T n)56     inline int log2(T n) {
57         int log = 0;
58         while (n >>= 1) ++log;
59         return log;
60     }
61 
62     // Sorts [begin, end) using insertion sort with the given comparison function.
63     template<class Iter, class Compare>
insertion_sort(Iter begin,Iter end,Compare comp)64     inline void insertion_sort(Iter begin, Iter end, Compare comp) {
65         typedef typename std::iterator_traits<Iter>::value_type T;
66         if (begin == end) return;
67 
68         for (Iter cur = begin + 1; cur != end; ++cur) {
69             Iter sift = cur;
70             Iter sift_1 = cur - 1;
71 
72             // Compare first so we can avoid 2 moves for an element already positioned correctly.
73             if (comp(*sift, *sift_1)) {
74                 T tmp = BOOST_PDQSORT_PREFER_MOVE(*sift);
75 
76                 do { *sift-- = BOOST_PDQSORT_PREFER_MOVE(*sift_1); }
77                 while (sift != begin && comp(tmp, *--sift_1));
78 
79                 *sift = BOOST_PDQSORT_PREFER_MOVE(tmp);
80             }
81         }
82     }
83 
84     // Sorts [begin, end) using insertion sort with the given comparison function. Assumes
85     // *(begin - 1) is an element smaller than or equal to any element in [begin, end).
86     template<class Iter, class Compare>
unguarded_insertion_sort(Iter begin,Iter end,Compare comp)87     inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp) {
88         typedef typename std::iterator_traits<Iter>::value_type T;
89         if (begin == end) return;
90 
91         for (Iter cur = begin + 1; cur != end; ++cur) {
92             Iter sift = cur;
93             Iter sift_1 = cur - 1;
94 
95             // Compare first so we can avoid 2 moves for an element already positioned correctly.
96             if (comp(*sift, *sift_1)) {
97                 T tmp = BOOST_PDQSORT_PREFER_MOVE(*sift);
98 
99                 do { *sift-- = BOOST_PDQSORT_PREFER_MOVE(*sift_1); }
100                 while (comp(tmp, *--sift_1));
101 
102                 *sift = BOOST_PDQSORT_PREFER_MOVE(tmp);
103             }
104         }
105     }
106 
107     // Attempts to use insertion sort on [begin, end). Will return false if more than
108     // partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will
109     // successfully sort and return true.
110     template<class Iter, class Compare>
partial_insertion_sort(Iter begin,Iter end,Compare comp)111     inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp) {
112         typedef typename std::iterator_traits<Iter>::value_type T;
113         if (begin == end) return true;
114 
115         std::size_t limit = 0;
116         for (Iter cur = begin + 1; cur != end; ++cur) {
117             Iter sift = cur;
118             Iter sift_1 = cur - 1;
119 
120             // Compare first so we can avoid 2 moves for an element already positioned correctly.
121             if (comp(*sift, *sift_1)) {
122                 T tmp = BOOST_PDQSORT_PREFER_MOVE(*sift);
123 
124                 do { *sift-- = BOOST_PDQSORT_PREFER_MOVE(*sift_1); }
125                 while (sift != begin && comp(tmp, *--sift_1));
126 
127                 *sift = BOOST_PDQSORT_PREFER_MOVE(tmp);
128                 limit += cur - sift;
129             }
130 
131             if (limit > partial_insertion_sort_limit) return false;
132         }
133 
134         return true;
135     }
136 
137     template<class Iter, class Compare>
sort2(Iter a,Iter b,Compare comp)138     inline void sort2(Iter a, Iter b, Compare comp) {
139         if (comp(*b, *a)) std::iter_swap(a, b);
140     }
141 
142     // Sorts the elements *a, *b and *c using comparison function comp.
143     template<class Iter, class Compare>
sort3(Iter a,Iter b,Iter c,Compare comp)144     inline void sort3(Iter a, Iter b, Iter c, Compare comp) {
145         sort2(a, b, comp);
146         sort2(b, c, comp);
147         sort2(a, b, comp);
148     }
149 
150     template<class T>
align_cacheline(T * p)151     inline T* align_cacheline(T* p) {
152 #if defined(UINTPTR_MAX) && __cplusplus >= 201103L
153         std::uintptr_t ip = reinterpret_cast<std::uintptr_t>(p);
154 #else
155         std::size_t ip = reinterpret_cast<std::size_t>(p);
156 #endif
157         ip = (ip + cacheline_size - 1) & -cacheline_size;
158         return reinterpret_cast<T*>(ip);
159     }
160 
161     template<class Iter>
swap_offsets(Iter first,Iter last,unsigned char * offsets_l,unsigned char * offsets_r,int num,bool use_swaps)162     inline void swap_offsets(Iter first, Iter last,
163                              unsigned char* offsets_l, unsigned char* offsets_r,
164                              int num, bool use_swaps) {
165         typedef typename std::iterator_traits<Iter>::value_type T;
166         if (use_swaps) {
167             // This case is needed for the descending distribution, where we need
168             // to have proper swapping for pdqsort to remain O(n).
169             for (int i = 0; i < num; ++i) {
170                 std::iter_swap(first + offsets_l[i], last - offsets_r[i]);
171             }
172         } else if (num > 0) {
173             Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0];
174             T tmp(BOOST_PDQSORT_PREFER_MOVE(*l)); *l = BOOST_PDQSORT_PREFER_MOVE(*r);
175             for (int i = 1; i < num; ++i) {
176                 l = first + offsets_l[i]; *r = BOOST_PDQSORT_PREFER_MOVE(*l);
177                 r = last - offsets_r[i]; *l = BOOST_PDQSORT_PREFER_MOVE(*r);
178             }
179             *r = BOOST_PDQSORT_PREFER_MOVE(tmp);
180         }
181     }
182 
183     // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
184     // to the pivot are put in the right-hand partition. Returns the position of the pivot after
185     // partitioning and whether the passed sequence already was correctly partitioned. Assumes the
186     // pivot is a median of at least 3 elements and that [begin, end) is at least
187     // insertion_sort_threshold long. Uses branchless partitioning.
188     template<class Iter, class Compare>
partition_right_branchless(Iter begin,Iter end,Compare comp)189     inline std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp) {
190         typedef typename std::iterator_traits<Iter>::value_type T;
191 
192         // Move pivot into local for speed.
193         T pivot(BOOST_PDQSORT_PREFER_MOVE(*begin));
194         Iter first = begin;
195         Iter last = end;
196 
197         // Find the first element greater than or equal than the pivot (the median of 3 guarantees
198         // this exists).
199         while (comp(*++first, pivot));
200 
201         // Find the first element strictly smaller than the pivot. We have to guard this search if
202         // there was no element before *first.
203         if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
204         else                    while (                !comp(*--last, pivot));
205 
206         // If the first pair of elements that should be swapped to partition are the same element,
207         // the passed in sequence already was correctly partitioned.
208         bool already_partitioned = first >= last;
209         if (!already_partitioned) {
210             std::iter_swap(first, last);
211             ++first;
212         }
213 
214         // The following branchless partitioning is derived from "BlockQuicksort: How Branch
215         // Mispredictions don't affect Quicksort" by Stefan Edelkamp and Armin Weiss.
216         unsigned char offsets_l_storage[block_size + cacheline_size];
217         unsigned char offsets_r_storage[block_size + cacheline_size];
218         unsigned char* offsets_l = align_cacheline(offsets_l_storage);
219         unsigned char* offsets_r = align_cacheline(offsets_r_storage);
220         int num_l, num_r, start_l, start_r;
221         num_l = num_r = start_l = start_r = 0;
222 
223         while (last - first > 2 * block_size) {
224             // Fill up offset blocks with elements that are on the wrong side.
225             if (num_l == 0) {
226                 start_l = 0;
227                 Iter it = first;
228                 for (unsigned char i = 0; i < block_size;) {
229                     offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
230                     offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
231                     offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
232                     offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
233                     offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
234                     offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
235                     offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
236                     offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
237                 }
238             }
239             if (num_r == 0) {
240                 start_r = 0;
241                 Iter it = last;
242                 for (unsigned char i = 0; i < block_size;) {
243                     offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
244                     offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
245                     offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
246                     offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
247                     offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
248                     offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
249                     offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
250                     offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
251                 }
252             }
253 
254             // Swap elements and update block sizes and first/last boundaries.
255             int num = (std::min)(num_l, num_r);
256             swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r,
257                          num, num_l == num_r);
258             num_l -= num; num_r -= num;
259             start_l += num; start_r += num;
260             if (num_l == 0) first += block_size;
261             if (num_r == 0) last -= block_size;
262         }
263 
264         int l_size = 0, r_size = 0;
265         int unknown_left = (int)(last - first) - ((num_r || num_l) ? block_size : 0);
266         if (num_r) {
267             // Handle leftover block by assigning the unknown elements to the other block.
268             l_size = unknown_left;
269             r_size = block_size;
270         } else if (num_l) {
271             l_size = block_size;
272             r_size = unknown_left;
273         } else {
274             // No leftover block, split the unknown elements in two blocks.
275             l_size = unknown_left/2;
276             r_size = unknown_left - l_size;
277         }
278 
279         // Fill offset buffers if needed.
280         if (unknown_left && !num_l) {
281             start_l = 0;
282             Iter it = first;
283             for (unsigned char i = 0; i < l_size;) {
284                 offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
285             }
286         }
287         if (unknown_left && !num_r) {
288             start_r = 0;
289             Iter it = last;
290             for (unsigned char i = 0; i < r_size;) {
291                 offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
292             }
293         }
294 
295         int num = (std::min)(num_l, num_r);
296         swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, num, num_l == num_r);
297         num_l -= num; num_r -= num;
298         start_l += num; start_r += num;
299         if (num_l == 0) first += l_size;
300         if (num_r == 0) last -= r_size;
301 
302         // We have now fully identified [first, last)'s proper position. Swap the last elements.
303         if (num_l) {
304             offsets_l += start_l;
305             while (num_l--) std::iter_swap(first + offsets_l[num_l], --last);
306             first = last;
307         }
308         if (num_r) {
309             offsets_r += start_r;
310             while (num_r--) std::iter_swap(last - offsets_r[num_r], first), ++first;
311             last = first;
312         }
313 
314         // Put the pivot in the right place.
315         Iter pivot_pos = first - 1;
316         *begin = BOOST_PDQSORT_PREFER_MOVE(*pivot_pos);
317         *pivot_pos = BOOST_PDQSORT_PREFER_MOVE(pivot);
318 
319         return std::make_pair(pivot_pos, already_partitioned);
320     }
321 
322     // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
323     // to the pivot are put in the right-hand partition. Returns the position of the pivot after
324     // partitioning and whether the passed sequence already was correctly partitioned. Assumes the
325     // pivot is a median of at least 3 elements and that [begin, end) is at least
326     // insertion_sort_threshold long.
327     template<class Iter, class Compare>
partition_right(Iter begin,Iter end,Compare comp)328     inline std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp) {
329         typedef typename std::iterator_traits<Iter>::value_type T;
330 
331         // Move pivot into local for speed.
332         T pivot(BOOST_PDQSORT_PREFER_MOVE(*begin));
333 
334         Iter first = begin;
335         Iter last = end;
336 
337         // Find the first element greater than or equal than the pivot (the median of 3 guarantees
338         // this exists).
339         while (comp(*++first, pivot));
340 
341         // Find the first element strictly smaller than the pivot. We have to guard this search if
342         // there was no element before *first.
343         if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
344         else                    while (                !comp(*--last, pivot));
345 
346         // If the first pair of elements that should be swapped to partition are the same element,
347         // the passed in sequence already was correctly partitioned.
348         bool already_partitioned = first >= last;
349 
350         // Keep swapping pairs of elements that are on the wrong side of the pivot. Previously
351         // swapped pairs guard the searches, which is why the first iteration is special-cased
352         // above.
353         while (first < last) {
354             std::iter_swap(first, last);
355             while (comp(*++first, pivot));
356             while (!comp(*--last, pivot));
357         }
358 
359         // Put the pivot in the right place.
360         Iter pivot_pos = first - 1;
361         *begin = BOOST_PDQSORT_PREFER_MOVE(*pivot_pos);
362         *pivot_pos = BOOST_PDQSORT_PREFER_MOVE(pivot);
363 
364         return std::make_pair(pivot_pos, already_partitioned);
365     }
366 
367     // Similar function to the one above, except elements equal to the pivot are put to the left of
368     // the pivot and it doesn't check or return if the passed sequence already was partitioned.
369     // Since this is rarely used (the many equal case), and in that case pdqsort already has O(n)
370     // performance, no block quicksort is applied here for simplicity.
371     template<class Iter, class Compare>
partition_left(Iter begin,Iter end,Compare comp)372     inline Iter partition_left(Iter begin, Iter end, Compare comp) {
373         typedef typename std::iterator_traits<Iter>::value_type T;
374 
375         T pivot(BOOST_PDQSORT_PREFER_MOVE(*begin));
376         Iter first = begin;
377         Iter last = end;
378 
379         while (comp(pivot, *--last));
380 
381         if (last + 1 == end) while (first < last && !comp(pivot, *++first));
382         else                 while (                !comp(pivot, *++first));
383 
384         while (first < last) {
385             std::iter_swap(first, last);
386             while (comp(pivot, *--last));
387             while (!comp(pivot, *++first));
388         }
389 
390         Iter pivot_pos = last;
391         *begin = BOOST_PDQSORT_PREFER_MOVE(*pivot_pos);
392         *pivot_pos = BOOST_PDQSORT_PREFER_MOVE(pivot);
393 
394         return pivot_pos;
395     }
396 
397 
398     template<class Iter, class Compare, bool Branchless>
pdqsort_loop(Iter begin,Iter end,Compare comp,int bad_allowed,bool leftmost=true)399     inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true) {
400         typedef typename std::iterator_traits<Iter>::difference_type diff_t;
401 
402         // Use a while loop for tail recursion elimination.
403         while (true) {
404             diff_t size = end - begin;
405 
406             // Insertion sort is faster for small arrays.
407             if (size < insertion_sort_threshold) {
408                 if (leftmost) insertion_sort(begin, end, comp);
409                 else unguarded_insertion_sort(begin, end, comp);
410                 return;
411             }
412 
413             // Choose pivot as median of 3 or pseudomedian of 9.
414             diff_t s2 = size / 2;
415             if (size > ninther_threshold) {
416                 sort3(begin, begin + s2, end - 1, comp);
417                 sort3(begin + 1, begin + (s2 - 1), end - 2, comp);
418                 sort3(begin + 2, begin + (s2 + 1), end - 3, comp);
419                 sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp);
420                 std::iter_swap(begin, begin + s2);
421             } else sort3(begin + s2, begin, end - 1, comp);
422 
423             // If *(begin - 1) is the end of the right partition of a previous partition operation
424             // there is no element in [begin, end) that is smaller than *(begin - 1). Then if our
425             // pivot compares equal to *(begin - 1) we change strategy, putting equal elements in
426             // the left partition, greater elements in the right partition. We do not have to
427             // recurse on the left partition, since it's sorted (all equal).
428             if (!leftmost && !comp(*(begin - 1), *begin)) {
429                 begin = partition_left(begin, end, comp) + 1;
430                 continue;
431             }
432 
433             // Partition and get results.
434             std::pair<Iter, bool> part_result =
435                 Branchless ? partition_right_branchless(begin, end, comp)
436                            : partition_right(begin, end, comp);
437             Iter pivot_pos = part_result.first;
438             bool already_partitioned = part_result.second;
439 
440             // Check for a highly unbalanced partition.
441             diff_t l_size = pivot_pos - begin;
442             diff_t r_size = end - (pivot_pos + 1);
443             bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;
444 
445             // If we got a highly unbalanced partition we shuffle elements to break many patterns.
446             if (highly_unbalanced) {
447                 // If we had too many bad partitions, switch to heapsort to guarantee O(n log n).
448                 if (--bad_allowed == 0) {
449                     std::make_heap(begin, end, comp);
450                     std::sort_heap(begin, end, comp);
451                     return;
452                 }
453 
454                 if (l_size >= insertion_sort_threshold) {
455                     std::iter_swap(begin,             begin + l_size / 4);
456                     std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4);
457 
458                     if (l_size > ninther_threshold) {
459                         std::iter_swap(begin + 1,         begin + (l_size / 4 + 1));
460                         std::iter_swap(begin + 2,         begin + (l_size / 4 + 2));
461                         std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1));
462                         std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2));
463                     }
464                 }
465 
466                 if (r_size >= insertion_sort_threshold) {
467                     std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4));
468                     std::iter_swap(end - 1,                   end - r_size / 4);
469 
470                     if (r_size > ninther_threshold) {
471                         std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4));
472                         std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4));
473                         std::iter_swap(end - 2,             end - (1 + r_size / 4));
474                         std::iter_swap(end - 3,             end - (2 + r_size / 4));
475                     }
476                 }
477             } else {
478                 // If we were decently balanced and we tried to sort an already partitioned
479                 // sequence try to use insertion sort.
480                 if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp)
481                                         && partial_insertion_sort(pivot_pos + 1, end, comp)) return;
482             }
483 
484             // Sort the left partition first using recursion and do tail recursion elimination for
485             // the right-hand partition.
486             pdqsort_loop<Iter, Compare, Branchless>(begin, pivot_pos, comp, bad_allowed, leftmost);
487             begin = pivot_pos + 1;
488             leftmost = false;
489         }
490     }
491 }
492 
493 
494 /*! \brief Generic sort algorithm using random access iterators and a user-defined comparison operator.
495 
496     \details @c pdqsort is a fast generic sorting algorithm that is similar in concept to introsort
497 but runs faster on certain patterns. @c pdqsort is in-place, unstable, deterministic, has a worst
498 case runtime of <em>O(N * lg(N))</em> and a best case of <em>O(N)</em>. Even without patterns, the
499 quicksort has been very efficiently implemented, and @c pdqsort runs 1-5% faster than GCC 6.2's
500 @c std::sort. If the type being sorted is @c std::is_arithmetic and Compare is @c std::less or
501 @c std::greater this function will automatically use @c pdqsort_branchless for far greater speedups.
502 
503    \param[in] first Iterator pointer to first element.
504    \param[in] last Iterator pointing to one beyond the end of data.
505    \param[in] comp A binary functor that returns whether the first element passed to it should go before the second in order.
506    \pre [@c first, @c last) is a valid range.
507    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/MoveAssignable">MoveAssignable</a>
508    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/MoveConstructible">MoveConstructible</a>
509    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/LessThanComparable">LessThanComparable</a>
510    \post The elements in the range [@c first, @c last) are sorted in ascending order.
511 
512    \return @c void.
513 
514    \throws std::exception Propagates exceptions if any of the element comparisons, the element swaps
515    (or moves), functors, or any operations on iterators throw.
516    \warning Invalid arguments cause undefined behaviour.
517    \warning Throwing an exception may cause data loss.
518 */
519 template<class Iter, class Compare>
pdqsort(Iter first,Iter last,Compare comp)520 inline void pdqsort(Iter first, Iter last, Compare comp) {
521     if (first == last) return;
522     pdqsort_detail::pdqsort_loop<Iter, Compare,
523         pdqsort_detail::is_default_compare<typename boost::decay<Compare>::type>::value &&
524         boost::is_arithmetic<typename std::iterator_traits<Iter>::value_type>::value>(
525         first, last, comp, pdqsort_detail::log2(last - first));
526 }
527 
528 
529 /*! \brief Generic sort algorithm using random access iterators and a user-defined comparison operator.
530 
531     \details @c pdqsort_branchless is a fast generic sorting algorithm that is similar in concept to
532 introsort but runs faster on certain patterns. @c pdqsort_branchless is in-place, unstable,
533 deterministic, has a worst case runtime of <em>O(N * lg(N))</em> and a best case of <em>O(N)</em>.
534 Even without patterns, the quicksort has been very efficiently implemented with block based
535 partitioning, and @c pdqsort_branchless runs 80-90% faster than GCC 6.2's @c std::sort when sorting
536 small data such as integers. However, this speedup is gained by totally bypassing the branch
537 predictor, if your comparison operator or iterator contains branches you will most likely see little
538 gain or a small loss in performance.
539 
540    \param[in] first Iterator pointer to first element.
541    \param[in] last Iterator pointing to one beyond the end of data.
542    \param[in] comp A binary functor that returns whether the first element passed to it should go before the second in order.
543    \pre [@c first, @c last) is a valid range.
544    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/MoveAssignable">MoveAssignable</a>
545    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/MoveConstructible">MoveConstructible</a>
546    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/LessThanComparable">LessThanComparable</a>
547    \post The elements in the range [@c first, @c last) are sorted in ascending order.
548 
549    \return @c void.
550 
551    \throws std::exception Propagates exceptions if any of the element comparisons, the element swaps
552    (or moves), functors, or any operations on iterators throw.
553    \warning Invalid arguments cause undefined behaviour.
554    \warning Throwing an exception may cause data loss.
555 */
556 template<class Iter, class Compare>
pdqsort_branchless(Iter first,Iter last,Compare comp)557 inline void pdqsort_branchless(Iter first, Iter last, Compare comp) {
558     if (first == last) return;
559     pdqsort_detail::pdqsort_loop<Iter, Compare, true>(
560         first, last, comp, pdqsort_detail::log2(last - first));
561 }
562 
563 
564 /*! \brief Generic sort algorithm using random access iterators.
565 
566     \details @c pdqsort is a fast generic sorting algorithm that is similar in concept to introsort
567 but runs faster on certain patterns. @c pdqsort is in-place, unstable, deterministic, has a worst
568 case runtime of <em>O(N * lg(N))</em> and a best case of <em>O(N)</em>. Even without patterns, the
569 quicksort partitioning has been very efficiently implemented, and @c pdqsort runs 80-90% faster than
570 GCC 6.2's @c std::sort. If the type being sorted is @c std::is_arithmetic this function will
571 automatically use @c pdqsort_branchless.
572 
573    \param[in] first Iterator pointer to first element.
574    \param[in] last Iterator pointing to one beyond the end of data.
575    \pre [@c first, @c last) is a valid range.
576    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/MoveAssignable">MoveAssignable</a>
577    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/MoveConstructible">MoveConstructible</a>
578    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/LessThanComparable">LessThanComparable</a>
579    \post The elements in the range [@c first, @c last) are sorted in ascending order.
580 
581    \return @c void.
582 
583    \throws std::exception Propagates exceptions if any of the element comparisons, the element swaps
584    (or moves), functors, or any operations on iterators throw.
585    \warning Invalid arguments cause undefined behaviour.
586    \warning Throwing an exception may cause data loss.
587 */
588 template<class Iter>
pdqsort(Iter first,Iter last)589 inline void pdqsort(Iter first, Iter last) {
590     typedef typename std::iterator_traits<Iter>::value_type T;
591     pdqsort(first, last, std::less<T>());
592 }
593 
594 
595 /*! \brief Generic sort algorithm using random access iterators.
596 
597     \details @c pdqsort_branchless is a fast generic sorting algorithm that is similar in concept to
598 introsort but runs faster on certain patterns. @c pdqsort_branchless is in-place, unstable,
599 deterministic, has a worst case runtime of <em>O(N * lg(N))</em> and a best case of <em>O(N)</em>.
600 Even without patterns, the quicksort has been very efficiently implemented with block based
601 partitioning, and @c pdqsort_branchless runs 80-90% faster than GCC 6.2's @c std::sort when sorting
602 small data such as integers. However, this speedup is gained by totally bypassing the branch
603 predictor, if your comparison operator or iterator contains branches you will most likely see little
604 gain or a small loss in performance.
605 
606    \param[in] first Iterator pointer to first element.
607    \param[in] last Iterator pointing to one beyond the end of data.
608    \pre [@c first, @c last) is a valid range.
609    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/MoveAssignable">MoveAssignable</a>
610    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/MoveConstructible">MoveConstructible</a>
611    \pre @c RandomAccessIter @c value_type is <a href="http://en.cppreference.com/w/cpp/concept/LessThanComparable">LessThanComparable</a>
612    \post The elements in the range [@c first, @c last) are sorted in ascending order.
613 
614    \return @c void.
615 
616    \throws std::exception Propagates exceptions if any of the element comparisons, the element swaps
617    (or moves), functors, or any operations on iterators throw.
618    \warning Invalid arguments cause undefined behaviour.
619    \warning Throwing an exception may cause data loss.
620 */
621 template<class Iter>
pdqsort_branchless(Iter first,Iter last)622 inline void pdqsort_branchless(Iter first, Iter last) {
623     typedef typename std::iterator_traits<Iter>::value_type T;
624     pdqsort_branchless(first, last, std::less<T>());
625 }
626 
627 }
628 }
629 
630 #undef BOOST_PDQSORT_PREFER_MOVE
631 
632 #endif
633