1 /*******************************************************************************
2 * tlx/algorithm/multisequence_selection.hpp
3 *
4 * Implementation of multisequence partition and selection.
5 *
6 * Copied and modified from STXXL, see http://stxxl.org, which itself extracted
7 * it from MCSTL http://algo2.iti.uni-karlsruhe.de/singler/mcstl/. Both are
8 * distributed under the Boost Software License, Version 1.0.
9 *
10 * Part of tlx - http://panthema.net/tlx
11 *
12 * Copyright (C) 2007 Johannes Singler <singler@ira.uka.de>
13 * Copyright (C) 2014-2018 Timo Bingmann <tb@panthema.net>
14 *
15 * All rights reserved. Published under the Boost Software License, Version 1.0
16 ******************************************************************************/
17
18 #ifndef TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
19 #define TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
20
21 #include <algorithm>
22 #include <cassert>
23 #include <queue>
24 #include <utility>
25 #include <vector>
26
27 #include <tlx/container/simple_vector.hpp>
28 #include <tlx/math/round_to_power_of_two.hpp>
29
30 namespace tlx {
31
32 //! \addtogroup tlx_algorithm
33 //! \{
34
35 namespace multisequence_selection_detail {
36
37 //! Compare a pair of types lexicographically, ascending.
38 template <typename T1, typename T2, typename Comparator>
39 class lexicographic
40 {
41 protected:
42 Comparator& comp_;
43
44 public:
lexicographic(Comparator & comp)45 explicit lexicographic(Comparator& comp) : comp_(comp) { }
46
operator ()(const std::pair<T1,T2> & p1,const std::pair<T1,T2> & p2)47 inline bool operator () (const std::pair<T1, T2>& p1,
48 const std::pair<T1, T2>& p2) {
49 if (comp_(p1.first, p2.first))
50 return true;
51
52 if (comp_(p2.first, p1.first))
53 return false;
54
55 // firsts are equal
56 return p1.second < p2.second;
57 }
58 };
59
60 //! Compare a pair of types lexicographically, descending.
61 template <typename T1, typename T2, typename Comparator>
62 class lexicographic_rev
63 {
64 protected:
65 Comparator& comp_;
66
67 public:
lexicographic_rev(Comparator & comp)68 explicit lexicographic_rev(Comparator& comp) : comp_(comp) { }
69
operator ()(const std::pair<T1,T2> & p1,const std::pair<T1,T2> & p2)70 inline bool operator () (const std::pair<T1, T2>& p1,
71 const std::pair<T1, T2>& p2) {
72 if (comp_(p2.first, p1.first))
73 return true;
74
75 if (comp_(p1.first, p2.first))
76 return false;
77
78 // firsts are equal
79 return p2.second < p1.second;
80 }
81 };
82
83 } // namespace multisequence_selection_detail
84
85 /*!
86 * Selects the element at a certain global rank from several sorted sequences.
87 *
88 * The sequences are passed via a sequence of random-access iterator pairs, none
89 * of the sequences may be empty.
90 *
91 * \param begin_seqs Begin of the sequence of iterator pairs.
92 * \param end_seqs End of the sequence of iterator pairs.
93 * \param rank The global rank to partition at.
94 * \param offset The rank of the selected element in the global subsequence of
95 * elements equal to the selected element. If the selected element is unique,
96 * this number is 0.
97 * \param comp The ordering functor, defaults to std::less. */
98 template <typename ValueType, typename RanSeqs, typename RankType,
99 typename Comparator = std::less<ValueType> >
multisequence_selection(const RanSeqs & begin_seqs,const RanSeqs & end_seqs,const RankType & rank,RankType & offset,Comparator comp=Comparator ())100 ValueType multisequence_selection(
101 const RanSeqs& begin_seqs, const RanSeqs& end_seqs,
102 const RankType& rank,
103 RankType& offset,
104 Comparator comp = Comparator()) {
105
106 using iterator = typename std::iterator_traits<RanSeqs>
107 ::value_type::first_type;
108 using diff_type = typename std::iterator_traits<iterator>
109 ::difference_type;
110
111 using SamplePair = std::pair<ValueType, diff_type>;
112
113 using namespace multisequence_selection_detail;
114
115 // comparators for SamplePair
116 lexicographic<ValueType, diff_type, Comparator> lcomp(comp);
117 lexicographic_rev<ValueType, diff_type, Comparator> lrcomp(comp);
118
119 // number of sequences, number of elements in total (possibly including
120 // padding)
121 const diff_type m = std::distance(begin_seqs, end_seqs);
122 diff_type nmax, n;
123 RankType N = 0;
124
125 for (diff_type i = 0; i < m; ++i)
126 N += std::distance(begin_seqs[i].first, begin_seqs[i].second);
127
128 if (m == 0 || N == 0 || rank < 0 || rank >= N)
129 // result undefined when there is no data or rank is outside bounds
130 throw std::exception();
131
132 simple_vector<diff_type> seqlen(m);
133
134 seqlen[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second);
135 nmax = seqlen[0];
136 for (diff_type i = 1; i < m; ++i)
137 {
138 seqlen[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second);
139 nmax = std::max(nmax, seqlen[i]);
140 }
141
142 // pad all lists to this length, at least as long as any ns[i], equliaty iff
143 // nmax = 2^k - 1
144 diff_type l = round_up_to_power_of_two(nmax + 1) - 1;
145
146 simple_vector<diff_type> a(m), b(m);
147
148 for (diff_type i = 0; i < m; ++i)
149 a[i] = 0, b[i] = l;
150
151 n = l / 2;
152
153 // invariants:
154 // 0 <= a[i] <= seqlen[i], 0 <= b[i] <= l
155
156 // initial partition
157
158 std::vector<SamplePair> sample;
159
160 for (diff_type i = 0; i < m; ++i) {
161 if (n < seqlen[i])
162 sample.push_back(SamplePair(begin_seqs[i].first[n], i));
163 }
164
165 std::sort(sample.begin(), sample.end(), lcomp);
166
167 for (diff_type i = 0; i < m; ++i) {
168 // conceptual infinity
169 if (n >= seqlen[i])
170 sample.push_back(
171 SamplePair(begin_seqs[i].first[0] /* dummy element */, i));
172 }
173
174 size_t localrank = static_cast<size_t>(rank / l);
175
176 size_t j;
177 for (j = 0; j < localrank && ((n + 1) <= seqlen[sample[j].second]); ++j)
178 a[sample[j].second] += n + 1;
179 for ( ; j < static_cast<size_t>(m); ++j)
180 b[sample[j].second] -= n + 1;
181
182 // further refinement
183
184 while (n > 0)
185 {
186 n /= 2;
187
188 const ValueType* lmax = nullptr;
189 for (diff_type i = 0; i < m; ++i)
190 {
191 if (a[i] > 0)
192 {
193 if (!lmax)
194 {
195 lmax = &(begin_seqs[i].first[a[i] - 1]);
196 }
197 else
198 {
199 // max
200 if (comp(*lmax, begin_seqs[i].first[a[i] - 1]))
201 lmax = &(begin_seqs[i].first[a[i] - 1]);
202 }
203 }
204 }
205
206 for (diff_type i = 0; i < m; ++i)
207 {
208 diff_type middle = (b[i] + a[i]) / 2;
209 if (lmax && middle < seqlen[i] &&
210 comp(begin_seqs[i].first[middle], *lmax))
211 a[i] = std::min(a[i] + n + 1, seqlen[i]);
212 else
213 b[i] -= n + 1;
214 }
215
216 diff_type leftsize = 0;
217 for (diff_type i = 0; i < m; ++i)
218 leftsize += a[i] / (n + 1);
219
220 diff_type skew = rank / (n + 1) - leftsize;
221
222 if (skew > 0)
223 {
224 // move to the left, find smallest
225 std::priority_queue<
226 SamplePair, std::vector<SamplePair>,
227 lexicographic_rev<ValueType, diff_type, Comparator> >
228 pq(lrcomp);
229
230 for (diff_type i = 0; i < m; ++i) {
231 if (b[i] < seqlen[i])
232 pq.push(SamplePair(begin_seqs[i].first[b[i]], i));
233 }
234
235 for ( ; skew != 0 && !pq.empty(); --skew)
236 {
237 diff_type src = pq.top().second;
238 pq.pop();
239
240 a[src] = std::min(a[src] + n + 1, seqlen[src]);
241 b[src] += n + 1;
242
243 if (b[src] < seqlen[src])
244 pq.push(SamplePair(begin_seqs[src].first[b[src]], src));
245 }
246 }
247 else if (skew < 0)
248 {
249 // move to the right, find greatest
250 std::priority_queue<
251 SamplePair, std::vector<SamplePair>,
252 lexicographic<ValueType, diff_type, Comparator> >
253 pq(lcomp);
254
255 for (diff_type i = 0; i < m; ++i) {
256 if (a[i] > 0)
257 pq.push(SamplePair(begin_seqs[i].first[a[i] - 1], i));
258 }
259
260 for ( ; skew != 0; ++skew)
261 {
262 diff_type src = pq.top().second;
263 pq.pop();
264
265 a[src] -= n + 1;
266 b[src] -= n + 1;
267
268 if (a[src] > 0)
269 pq.push(SamplePair(begin_seqs[src].first[a[src] - 1], src));
270 }
271 }
272 }
273
274 // postconditions: a[i] == b[i] in most cases, except when a[i] has been
275 // clamped because of having reached the boundary
276
277 // now return the result, calculate the offset, compare the keys on both
278 // edges of the border
279
280 // maximum of left edge, minimum of right edge
281 ValueType* maxleft = nullptr, * minright = nullptr;
282 for (diff_type i = 0; i < m; ++i)
283 {
284 if (a[i] > 0)
285 {
286 if (!maxleft)
287 {
288 maxleft = &(begin_seqs[i].first[a[i] - 1]);
289 }
290 else
291 {
292 // max
293 if (comp(*maxleft, begin_seqs[i].first[a[i] - 1]))
294 maxleft = &(begin_seqs[i].first[a[i] - 1]);
295 }
296 }
297 if (b[i] < seqlen[i])
298 {
299 if (!minright)
300 {
301 minright = &(begin_seqs[i].first[b[i]]);
302 }
303 else
304 {
305 // min
306 if (comp(begin_seqs[i].first[b[i]], *minright))
307 minright = &(begin_seqs[i].first[b[i]]);
308 }
309 }
310 }
311
312 // minright is the splitter, in any case
313
314 if (!maxleft || comp(*minright, *maxleft))
315 {
316 // good luck, everything is split unambigiously
317 offset = 0;
318 }
319 else
320 {
321 // we have to calculate an offset
322 offset = 0;
323
324 for (diff_type i = 0; i < m; ++i)
325 {
326 diff_type lb = std::lower_bound(
327 begin_seqs[i].first, begin_seqs[i].first + seqlen[i],
328 *minright, comp) - begin_seqs[i].first;
329 offset += a[i] - lb;
330 }
331 }
332
333 return *minright;
334 }
335
336 //! \}
337
338 } // namespace tlx
339
340 #endif // !TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
341
342 /******************************************************************************/
343