//! Parallel merge sort.
//!
//! This implementation is copied verbatim from `std::slice::sort` and then parallelized.
//! The only difference from the original is that the sequential `mergesort` returns
//! `MergesortResult` and leaves descending arrays intact.

use iter::*;
use rayon_core;
use slice::ParallelSliceMut;
use std::mem;
use std::mem::size_of;
use std::ptr;
use std::slice;

unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
    let old = *ptr;
    *ptr = ptr.offset(1);
    old
}

unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
    *ptr = ptr.offset(-1);
    *ptr
}

/// When dropped, copies from `src` into `dest` a sequence of length `len`.
struct CopyOnDrop<T> {
    src: *mut T,
    dest: *mut T,
    len: usize,
}

impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dest, self.len);
        }
    }
}

/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
///
/// This is the integral subroutine of insertion sort.
fn insert_head<T, F>(v: &mut [T], is_less: &F)
where
    F: Fn(&T, &T) -> bool,
{
    if v.len() >= 2 && is_less(&v[1], &v[0]) {
        unsafe {
            // There are three ways to implement insertion here:
            //
            // 1. Swap adjacent elements until the first one gets to its final destination.
            //    However, this way we copy data around more than is necessary. If elements are big
            //    structures (costly to copy), this method will be slow.
            //
            // 2. Iterate until the right place for the first element is found. Then shift the
            //    elements succeeding it to make room for it and finally place it into the
            //    remaining hole. This is a good method.
            //
            // 3. Copy the first element into a temporary variable. Iterate until the right place
            //    for it is found. As we go along, copy every traversed element into the slot
            //    preceding it. Finally, copy data from the temporary variable into the remaining
            //    hole. This method is very good. Benchmarks demonstrated slightly better
            //    performance than with the 2nd method.
            //
            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
            let mut tmp = NoDrop {
                value: Some(ptr::read(&v[0])),
            };

            // Intermediate state of the insertion process is always tracked by `hole`, which
            // serves two purposes:
            // 1. Protects integrity of `v` from panics in `is_less`.
            // 2. Fills the remaining hole in `v` in the end.
            //
            // Panic safety:
            //
            // If `is_less` panics at any point during the process, `hole` will get dropped and
            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
            // initially held exactly once.
            let mut hole = InsertionHole {
                src: tmp.value.as_mut().unwrap(),
                dest: &mut v[1],
            };
            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);

            for i in 2..v.len() {
                if !is_less(&v[i], tmp.value.as_ref().unwrap()) {
                    break;
                }
                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
                hole.dest = &mut v[i];
            }
            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
        }
    }

    // Holds a value, but never drops it.
    struct NoDrop<T> {
        value: Option<T>,
    }

    impl<T> Drop for NoDrop<T> {
        fn drop(&mut self) {
            mem::forget(self.value.take());
        }
    }

    // When dropped, copies from `src` into `dest`.
    struct InsertionHole<T> {
        src: *mut T,
        dest: *mut T,
    }

    impl<T> Drop for InsertionHole<T> {
        fn drop(&mut self) {
            unsafe {
                ptr::copy_nonoverlapping(self.src, self.dest, 1);
            }
        }
    }
}
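
// Illustrative example of `insert_head` (added for exposition; not part of the original sort):
// the first element is moved into its place within the already sorted tail `v[1..]`.
#[cfg(test)]
mod insert_head_example {
    use super::insert_head;

    #[test]
    fn inserts_first_element_into_sorted_tail() {
        // `v[1..]` is sorted; only `v[0]` is out of place.
        let mut v = [4, 1, 2, 3, 5];
        insert_head(&mut v, &|a: &i32, b: &i32| a < b);
        assert_eq!(v, [1, 2, 3, 4, 5]);
    }
}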

/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
/// stores the result into `v[..]`.
///
/// # Safety
///
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &F)
where
    F: Fn(&T, &T) -> bool,
{
    let len = v.len();
    let v = v.as_mut_ptr();
    let v_mid = v.add(mid);
    let v_end = v.add(len);

    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
    // copying the lesser (or greater) one into `v`.
    //
    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
    // consumed first, then we must copy whatever is left of the shorter run into the remaining
    // hole in `v`.
    //
    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
    // 1. Protects integrity of `v` from panics in `is_less`.
    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
    //
    // Panic safety:
    //
    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
    // object it initially held exactly once.
    let mut hole;

    if mid <= len - mid {
        // The left run is shorter.
        ptr::copy_nonoverlapping(v, buf, mid);
        hole = MergeHole {
            start: buf,
            end: buf.add(mid),
            dest: v,
        };

        // Initially, these pointers point to the beginnings of their arrays.
        let left = &mut hole.start;
        let mut right = v_mid;
        let out = &mut hole.dest;

        while *left < hole.end && right < v_end {
            // Consume the lesser side.
            // If equal, prefer the left run to maintain stability.
            let to_copy = if is_less(&*right, &**left) {
                get_and_increment(&mut right)
            } else {
                get_and_increment(left)
            };
            ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
        }
    } else {
        // The right run is shorter.
        ptr::copy_nonoverlapping(v_mid, buf, len - mid);
        hole = MergeHole {
            start: buf,
            end: buf.add(len - mid),
            dest: v_mid,
        };

        // Initially, these pointers point past the ends of their arrays.
        let left = &mut hole.dest;
        let right = &mut hole.end;
        let mut out = v_end;

        while v < *left && buf < *right {
            // Consume the greater side.
            // If equal, prefer the right run to maintain stability.
            let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) {
                decrement_and_get(left)
            } else {
                decrement_and_get(right)
            };
            ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
        }
    }
    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
    // it will now be copied into the hole in `v`.

    // When dropped, copies the range `start..end` into `dest..`.
    struct MergeHole<T> {
        start: *mut T,
        end: *mut T,
        dest: *mut T,
    }

    impl<T> Drop for MergeHole<T> {
        fn drop(&mut self) {
            // `T` is not a zero-sized type, so it's okay to divide by its size.
            let len = (self.end as usize - self.start as usize) / size_of::<T>();
            unsafe {
                ptr::copy_nonoverlapping(self.start, self.dest, len);
            }
        }
    }
}
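
// Illustrative example of `merge` (added for exposition; not part of the original sort): the
// two sorted halves of `v` are merged in place, with `buf` holding the shorter run.
#[cfg(test)]
mod merge_example {
    use super::merge;

    #[test]
    fn merges_two_sorted_halves_in_place() {
        let mut v = [1, 4, 7, 2, 3, 9];
        // `buf` only needs room for the shorter run (here both runs have length 3).
        let mut buf: Vec<i32> = Vec::with_capacity(3);
        unsafe { merge(&mut v, 3, buf.as_mut_ptr(), &|a: &i32, b: &i32| a < b) };
        assert_eq!(v, [1, 2, 3, 4, 7, 9]);
    }
}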

/// The result of merge sort.
#[must_use]
#[derive(Clone, Copy, PartialEq, Eq)]
enum MergesortResult {
    /// The slice has already been sorted.
    NonDescending,
    /// The slice has been descending and therefore it was left intact.
    Descending,
    /// The slice was sorted.
    Sorted,
}

/// A sorted run that starts at index `start` and is of length `len`.
#[derive(Clone, Copy)]
struct Run {
    start: usize,
    len: usize,
}

/// Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
/// if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
/// algorithm should continue building a new run instead, `None` is returned.
///
/// TimSort is infamous for its buggy implementations, as described here:
/// http://envisage-project.eu/timsort-specification-and-verification/
///
/// The gist of the story is: we must enforce the invariants on the top four runs on the stack.
/// Enforcing them on just top three is not sufficient to ensure that the invariants will still
/// hold for *all* runs in the stack.
///
/// This function correctly checks invariants for the top four runs. Additionally, if the top
/// run starts at index 0, it will always demand a merge operation until the stack is fully
/// collapsed, in order to complete the sort.
#[inline]
fn collapse(runs: &[Run]) -> Option<usize> {
    let n = runs.len();

    if n >= 2
        && (runs[n - 1].start == 0
            || runs[n - 2].len <= runs[n - 1].len
            || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
            || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
    {
        if n >= 3 && runs[n - 3].len < runs[n - 1].len {
            Some(n - 3)
        } else {
            Some(n - 2)
        }
    } else {
        None
    }
}
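
// Illustrative example of `collapse` (added for exposition; not part of the original sort):
// runs are pushed as `v` is scanned backwards, so `start` decreases toward the top of the stack.
// Here the second invariant fails for the top three runs because 10 <= 6 + 5, so a merge of
// `runs[1]` and `runs[2]` is requested.
#[cfg(test)]
mod collapse_example {
    use super::{collapse, Run};

    #[test]
    fn requests_merge_when_invariants_are_violated() {
        let runs = [
            Run { start: 30, len: 10 },
            Run { start: 24, len: 6 },
            Run { start: 19, len: 5 },
        ];
        assert_eq!(collapse(&runs), Some(1));
    }
}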

/// Sorts a slice using merge sort, unless it is already in descending order.
///
/// This function doesn't modify the slice if it is already non-descending or descending.
/// Otherwise, it sorts the slice into non-descending order.
///
/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
/// [here](http://svn.python.org/projects/python/trunk/Objects/listsort.txt).
///
/// The algorithm identifies strictly descending and non-descending subsequences, which are called
/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
/// satisfied:
///
/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
///
/// The invariants ensure that the total running time is `O(n log n)` worst-case.
///
/// # Safety
///
/// The argument `buf` is used as a temporary buffer and must be at least as long as `v`.
unsafe fn mergesort<T, F>(v: &mut [T], buf: *mut T, is_less: &F) -> MergesortResult
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Very short runs are extended using insertion sort to span at least this many elements.
    const MIN_RUN: usize = 10;

    let len = v.len();

    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
    // strange decision, but consider the fact that merges more often go in the opposite direction
    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
    // backwards. To conclude, identifying runs by traversing backwards improves performance.
    let mut runs = vec![];
    let mut end = len;
    while end > 0 {
        // Find the next natural run, and reverse it if it's strictly descending.
        let mut start = end - 1;

        if start > 0 {
            start -= 1;

            if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
                while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
                    start -= 1;
                }

                // If this descending run covers the whole slice, return immediately.
                if start == 0 && end == len {
                    return MergesortResult::Descending;
                } else {
                    v[start..end].reverse();
                }
            } else {
                while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
                    start -= 1;
                }

                // If this non-descending run covers the whole slice, return immediately.
                if end - start == len {
                    return MergesortResult::NonDescending;
                }
            }
        }

        // Insert some more elements into the run if it's too short. Insertion sort is faster than
        // merge sort on short sequences, so this significantly improves performance.
        while start > 0 && end - start < MIN_RUN {
            start -= 1;
            insert_head(&mut v[start..end], &is_less);
        }

        // Push this run onto the stack.
        runs.push(Run {
            start,
            len: end - start,
        });
        end = start;

        // Merge some pairs of adjacent runs to satisfy the invariants.
        while let Some(r) = collapse(&runs) {
            let left = runs[r + 1];
            let right = runs[r];
            merge(
                &mut v[left.start..right.start + right.len],
                left.len,
                buf,
                &is_less,
            );

            runs[r] = Run {
                start: left.start,
                len: left.len + right.len,
            };
            runs.remove(r + 1);
        }
    }

    // Finally, exactly one run must remain in the stack.
    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);

    // The original order of the slice was neither non-descending nor descending.
    MergesortResult::Sorted
}
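
// Illustrative example of the sequential `mergesort` (added for exposition; not part of the
// original sort): a scratch buffer at least as long as the slice is passed in, mirroring how
// `par_mergesort` calls it below. The slice is neither non-descending nor descending, so it
// gets sorted and `Sorted` is returned.
#[cfg(test)]
mod mergesort_example {
    use super::{mergesort, MergesortResult};

    #[test]
    fn sorts_a_mixed_slice() {
        let mut v = vec![5, 1, 4, 2, 3, 9, 0, 8, 7, 6];
        let mut buf: Vec<i32> = Vec::with_capacity(v.len());
        let res = unsafe { mergesort(&mut v, buf.as_mut_ptr(), &|a: &i32, b: &i32| a < b) };
        assert!(res == MergesortResult::Sorted);
        assert_eq!(v, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
    }
}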

////////////////////////////////////////////////////////////////////////////
// Everything above this line is copied from `std::slice::sort` (with very minor tweaks).
// Everything below this line is parallelization.
////////////////////////////////////////////////////////////////////////////

/// Splits two sorted slices so that they can be merged in parallel.
///
/// Returns two indices `(a, b)` so that slices `left[..a]` and `right[..b]` come before
/// `left[a..]` and `right[b..]`.
fn split_for_merge<T, F>(left: &[T], right: &[T], is_less: &F) -> (usize, usize)
where
    F: Fn(&T, &T) -> bool,
{
    let left_len = left.len();
    let right_len = right.len();

    if left_len >= right_len {
        let left_mid = left_len / 2;

        // Find the first element in `right` that is greater than or equal to `left[left_mid]`.
        let mut a = 0;
        let mut b = right_len;
        while a < b {
            let m = a + (b - a) / 2;
            if is_less(&right[m], &left[left_mid]) {
                a = m + 1;
            } else {
                b = m;
            }
        }

        (left_mid, a)
    } else {
        let right_mid = right_len / 2;

        // Find the first element in `left` that is greater than `right[right_mid]`.
        let mut a = 0;
        let mut b = left_len;
        while a < b {
            let m = a + (b - a) / 2;
            if is_less(&right[right_mid], &left[m]) {
                b = m;
            } else {
                a = m + 1;
            }
        }

        (a, right_mid)
    }
}
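
// Illustrative example of `split_for_merge` (added for exposition; not part of the original
// sort): the longer slice is halved at `5`, and the first element of `right` that is not less
// than `5` sits at index 2, so `left[..2]` and `right[..2]` can be merged independently of
// `left[2..]` and `right[2..]`.
#[cfg(test)]
mod split_for_merge_example {
    use super::split_for_merge;

    #[test]
    fn splits_two_sorted_slices() {
        let left = [1, 3, 5, 7];
        let right = [2, 4, 6];
        assert_eq!(split_for_merge(&left, &right, &|a: &i32, b: &i32| a < b), (2, 2));
    }
}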

/// Merges slices `left` and `right` in parallel and stores the result into `dest`.
///
/// # Safety
///
/// The `dest` pointer must have enough space to store the result.
///
/// Even if `is_less` panics at any point during the merge process, this function will fully copy
/// all elements from `left` and `right` into `dest` (not necessarily in sorted order).
unsafe fn par_merge<T, F>(left: &mut [T], right: &mut [T], dest: *mut T, is_less: &F)
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Slices whose lengths sum up to this value are merged sequentially. This number is slightly
    // larger than `CHUNK_LENGTH`, and the reason is that merging is faster than merge sorting, so
    // merging needs a bit coarser granularity in order to hide the overhead of Rayon's task
    // scheduling.
    const MAX_SEQUENTIAL: usize = 5000;

    let left_len = left.len();
    let right_len = right.len();

    // Intermediate state of the merge process, which serves two purposes:
    // 1. Protects integrity of `dest` from panics in `is_less`.
    // 2. Copies the remaining elements as soon as one of the two sides is exhausted.
    //
    // Panic safety:
    //
    // If `is_less` panics at any point during the merge process, `s` will get dropped and copy the
    // remaining parts of `left` and `right` into `dest`.
    let mut s = State {
        left_start: left.as_mut_ptr(),
        left_end: left.as_mut_ptr().add(left_len),
        right_start: right.as_mut_ptr(),
        right_end: right.as_mut_ptr().add(right_len),
        dest,
    };

    if left_len == 0 || right_len == 0 || left_len + right_len < MAX_SEQUENTIAL {
        while s.left_start < s.left_end && s.right_start < s.right_end {
            // Consume the lesser side.
            // If equal, prefer the left run to maintain stability.
            let to_copy = if is_less(&*s.right_start, &*s.left_start) {
                get_and_increment(&mut s.right_start)
            } else {
                get_and_increment(&mut s.left_start)
            };
            ptr::copy_nonoverlapping(to_copy, get_and_increment(&mut s.dest), 1);
        }
    } else {
        // Function `split_for_merge` might panic. If that happens, `s` will get dropped and copy
        // the whole `left` and `right` into `dest`.
        let (left_mid, right_mid) = split_for_merge(left, right, is_less);
        let (left_l, left_r) = left.split_at_mut(left_mid);
        let (right_l, right_r) = right.split_at_mut(right_mid);

        // Prevent the destructor of `s` from running. Rayon will ensure that both calls to
        // `par_merge` happen. If one of the two calls panics, they will ensure that elements still
        // get copied into `dest_l` and `dest_r`.
        mem::forget(s);

        // Convert the pointers to `usize` because `*mut T` is not `Send`.
        let dest_l = dest as usize;
        let dest_r = dest.add(left_l.len() + right_l.len()) as usize;
        rayon_core::join(
            || par_merge(left_l, right_l, dest_l as *mut T, is_less),
            || par_merge(left_r, right_r, dest_r as *mut T, is_less),
        );
    }
    // Finally, `s` gets dropped if we used sequential merge, thus copying the remaining elements
    // all at once.

    // When dropped, copies arrays `left_start..left_end` and `right_start..right_end` into `dest`,
    // in that order.
    struct State<T> {
        left_start: *mut T,
        left_end: *mut T,
        right_start: *mut T,
        right_end: *mut T,
        dest: *mut T,
    }

    impl<T> Drop for State<T> {
        fn drop(&mut self) {
            let size = size_of::<T>();
            let left_len = (self.left_end as usize - self.left_start as usize) / size;
            let right_len = (self.right_end as usize - self.right_start as usize) / size;

            // Copy array `left`, followed by `right`.
            unsafe {
                ptr::copy_nonoverlapping(self.left_start, self.dest, left_len);
                self.dest = self.dest.add(left_len);
                ptr::copy_nonoverlapping(self.right_start, self.dest, right_len);
            }
        }
    }
}
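
// Illustrative example of `par_merge` (added for exposition; not part of the original sort):
// two small sorted slices are merged into a separate destination buffer. Slices this short take
// the sequential path, but the parallel path produces the same result.
#[cfg(test)]
mod par_merge_example {
    use super::par_merge;

    #[test]
    fn merges_into_destination_buffer() {
        let mut left = [1, 3, 5];
        let mut right = [2, 4, 6];
        // Pre-filled so the buffer is fully initialized; `par_merge` overwrites all of it.
        let mut dest = vec![0; 6];
        unsafe { par_merge(&mut left, &mut right, dest.as_mut_ptr(), &|a: &i32, b: &i32| a < b) };
        assert_eq!(dest, [1, 2, 3, 4, 5, 6]);
    }
}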

/// Recursively merges pre-sorted chunks inside `v`.
///
/// Chunks of `v` are stored in `chunks` as intervals (inclusive left and exclusive right bound).
/// Argument `buf` is an auxiliary buffer that will be used during the procedure.
/// If `into_buf` is true, the result will be stored into `buf`, otherwise it will be in `v`.
///
/// # Safety
///
/// The number of chunks must be positive and they must be adjacent: the right bound of each chunk
/// must equal the left bound of the following chunk.
///
/// The buffer must be at least as long as `v`.
unsafe fn recurse<T, F>(
    v: *mut T,
    buf: *mut T,
    chunks: &[(usize, usize)],
    into_buf: bool,
    is_less: &F,
) where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    let len = chunks.len();
    debug_assert!(len > 0);

    // Base case of the algorithm.
    // If only one chunk is remaining, there's no more work to split and merge.
    if len == 1 {
        if into_buf {
            // Copy the chunk from `v` into `buf`.
            let (start, end) = chunks[0];
            let src = v.add(start);
            let dest = buf.add(start);
            ptr::copy_nonoverlapping(src, dest, end - start);
        }
        return;
    }

    // Split the chunks into two halves.
    let (start, _) = chunks[0];
    let (mid, _) = chunks[len / 2];
    let (_, end) = chunks[len - 1];
    let (left, right) = chunks.split_at(len / 2);

    // After recursive calls finish we'll have to merge chunks `(start, mid)` and `(mid, end)` from
    // `src` into `dest`. If the current invocation has to store the result into `buf`, we'll
    // merge chunks from `v` into `buf`, and vice versa.
    //
    // Recursive calls flip `into_buf` at each level of recursion. More concretely, `par_merge`
    // merges chunks from `buf` into `v` at the first level, from `v` into `buf` at the second
    // level etc.
    let (src, dest) = if into_buf { (v, buf) } else { (buf, v) };

    // Panic safety:
    //
    // If `is_less` panics at any point during the recursive calls, the destructor of `guard` will
    // be executed, thus copying everything from `src` into `dest`. This way we ensure that all
    // chunks are in fact copied into `dest`, even if the merge process doesn't finish.
    let guard = CopyOnDrop {
        src: src.add(start),
        dest: dest.add(start),
        len: end - start,
    };

    // Convert the pointers to `usize` because `*mut T` is not `Send`.
    let v = v as usize;
    let buf = buf as usize;
    rayon_core::join(
        || recurse(v as *mut T, buf as *mut T, left, !into_buf, is_less),
        || recurse(v as *mut T, buf as *mut T, right, !into_buf, is_less),
    );

    // Everything went all right - recursive calls didn't panic.
    // Forget the guard in order to prevent its destructor from running.
    mem::forget(guard);

    // Merge chunks `(start, mid)` and `(mid, end)` from `src` into `dest`.
    let src_left = slice::from_raw_parts_mut(src.add(start), mid - start);
    let src_right = slice::from_raw_parts_mut(src.add(mid), end - mid);
    par_merge(src_left, src_right, dest.add(start), is_less);
}

/// Sorts `v` using merge sort in parallel.
///
/// The algorithm is stable, allocates memory, and runs in `O(n log n)` worst-case time.
/// The allocated temporary buffer has the same length as `v`.
pub(super) fn par_mergesort<T, F>(v: &mut [T], is_less: F)
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Slices of up to this length get sorted using insertion sort in order to avoid the cost of
    // buffer allocation.
    const MAX_INSERTION: usize = 20;
    // The length of initial chunks. This number is as small as possible but so that the overhead
    // of Rayon's task scheduling is still negligible.
    const CHUNK_LENGTH: usize = 2000;

    // Sorting has no meaningful behavior on zero-sized types.
    if size_of::<T>() == 0 {
        return;
    }

    let len = v.len();

    // Short slices get sorted in-place via insertion sort to avoid allocations.
    if len <= MAX_INSERTION {
        if len >= 2 {
            for i in (0..len - 1).rev() {
                insert_head(&mut v[i..], &is_less);
            }
        }
        return;
    }

    // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
    // shallow copies of the contents of `v` without risking the dtors running on copies if
    // `is_less` panics.
    let mut buf = Vec::<T>::with_capacity(len);
    let buf = buf.as_mut_ptr();

    // If the slice is not longer than one chunk would be, do sequential merge sort and return.
    if len <= CHUNK_LENGTH {
        let res = unsafe { mergesort(v, buf, &is_less) };
        if res == MergesortResult::Descending {
            v.reverse();
        }
        return;
    }

    // Split the slice into chunks and merge sort them in parallel.
    // However, descending chunks will not be sorted - they will be simply left intact.
    let mut iter = {
        // Convert the pointer to `usize` because `*mut T` is not `Send`.
        let buf = buf as usize;

        v.par_chunks_mut(CHUNK_LENGTH)
            .with_max_len(1)
            .enumerate()
            .map(|(i, chunk)| {
                let l = CHUNK_LENGTH * i;
                let r = l + chunk.len();
                unsafe {
                    let buf = (buf as *mut T).add(l);
                    (l, r, mergesort(chunk, buf, &is_less))
                }
            })
            .collect::<Vec<_>>()
            .into_iter()
            .peekable()
    };

    // Now attempt to concatenate adjacent chunks that were left intact.
    let mut chunks = Vec::with_capacity(iter.len());

    while let Some((a, mut b, res)) = iter.next() {
        // If this chunk was not modified by the sort procedure...
        if res != MergesortResult::Sorted {
            while let Some(&(x, y, r)) = iter.peek() {
                // If the following chunk is of the same type and can be concatenated...
                if r == res && (r == MergesortResult::Descending) == is_less(&v[x], &v[x - 1]) {
                    // Concatenate them.
                    b = y;
                    iter.next();
                } else {
                    break;
                }
            }
        }

        // Descending chunks must be reversed.
        if res == MergesortResult::Descending {
            v[a..b].reverse();
        }

        chunks.push((a, b));
    }

    // All chunks are properly sorted.
    // Now we just have to merge them together.
    unsafe {
        recurse(v.as_mut_ptr(), buf, &chunks, false, &is_less);
    }
}
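
// Illustrative example of `par_mergesort` (added for exposition; not part of the original
// sort): the slice is long enough to be split into several chunks, so the chunks are sorted in
// parallel and then merged.
#[cfg(test)]
mod par_mergesort_example {
    use super::par_mergesort;

    #[test]
    fn sorts_a_large_slice() {
        // 7919 is coprime with 20_000, so this is a permutation of 0..20_000.
        let mut v: Vec<i32> = (0..20_000).map(|i| (i * 7919) % 20_000).collect();
        par_mergesort(&mut v, |a, b| a < b);
        assert!(v.windows(2).all(|w| w[0] <= w[1]));
    }
}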

#[cfg(test)]
mod tests {
    use super::split_for_merge;
    use rand::distributions::Uniform;
    use rand::{thread_rng, Rng};

    #[test]
    fn test_split_for_merge() {
        fn check(left: &[u32], right: &[u32]) {
            let (l, r) = split_for_merge(left, right, &|&a, &b| a < b);
            assert!(left[..l]
                .iter()
                .all(|&x| right[r..].iter().all(|&y| x <= y)));
            assert!(right[..r].iter().all(|&x| left[l..].iter().all(|&y| x < y)));
        }

        check(&[1, 2, 2, 2, 2, 3], &[1, 2, 2, 2, 2, 3]);
        check(&[1, 2, 2, 2, 2, 3], &[]);
        check(&[], &[1, 2, 2, 2, 2, 3]);

        let mut rng = thread_rng();

        for _ in 0..100 {
            let limit: u32 = rng.gen_range(1, 21);
            let left_len: usize = rng.gen_range(0, 20);
            let right_len: usize = rng.gen_range(0, 20);

            let mut left = rng
                .sample_iter(&Uniform::new(0, limit))
                .take(left_len)
                .collect::<Vec<_>>();
            let mut right = rng
                .sample_iter(&Uniform::new(0, limit))
                .take(right_len)
                .collect::<Vec<_>>();

            left.sort();
            right.sort();
            check(&left, &right);
        }
    }
}