1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 2.0
3 *
4 * This Source Code Form is subject to the terms of the Mozilla Public
5 * License, v. 2.0. If a copy of the MPL was not distributed with this
6 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 *
8 * Software distributed under the License is distributed on an "AS IS" basis,
9 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
10 * for the specific language governing rights and limitations under the
11 * License.
12 *
13 * The Original Code is the MSufSort suffix sorting algorithm (Version 2.2).
14 *
15 * The Initial Developer of the Original Code is
16 * Michael A. Maniscalco
17 * Portions created by the Initial Developer are Copyright (C) 2006
18 * the Initial Developer. All Rights Reserved.
19 *
20 * Contributor(s):
21 *
22 * Michael A. Maniscalco
23 *
24 * ***** END LICENSE BLOCK ***** */
25
26 #ifndef MSUFSORT_H
27 #define MSUFSORT_H
28
29 //==================================================================//
30 // //
31 // v //
32 // MSufSort Version 2.2 //
33 // Author: Michael A Maniscalco //
34 // Date: Nov. 3, 2005 //
35 // //
36 // Notes: //
37 // //
38 //==================================================================//
39
40
41 #include "stdio.h"
42 #include "stack.h"
43 #include "introsort.h"
44 #include "inductionsort.h"
45
46
47 //==================================================================//
48 // Test app defines: //
49 //==================================================================//
50
51 #define SHOW_PROGRESS // display progress during sort
52 #define CHECK_SORT // verify that sorting is correct.
53 // #define SORT_16_BIT_SYMBOLS // enable 16 bit symbols.
54
55 #define USE_INDUCTION_SORTING // enable induction sorting feature.
56 #define USE_ENHANCED_INDUCTION_SORTING // enable enhanced induction sorting feature.
57 #define USE_TANDEM_REPEAT_SORTING // enable the tandem repeat sorting feature.
58 //#define USE_ALT_SORT_ORDER // enable alternative sorting order
59
60
61 #define ENDIAN_SWAP_16(value) ((value >> 8) | (value << 8))
62
63 #define SUFFIX_SORTED 0x80000000 // flag marks suffix as sorted.
64 #define END_OF_CHAIN 0x3ffffffe // marks the end of a chain
65 #define SORTED_BY_ENHANCED_INDUCTION 0x3fffffff // marks suffix which will be sorted by enhanced induction sort.
66
67
68
69 #ifdef SORT_16_BIT_SYMBOLS
70 #define SYMBOL_TYPE unsigned short
71 #else
72 #define SYMBOL_TYPE unsigned char
73 #endif
74
75 class MSufSort
76 {
77 public:
78 MSufSort();
79
80 virtual ~MSufSort();
81
82 unsigned int Sort(SYMBOL_TYPE * source, unsigned int sourceLength);
83
84 unsigned int GetElapsedSortTime();
85
86 unsigned int GetMemoryUsage();
87
88 unsigned int ISA(unsigned int index);
89
90 bool VerifySort();
91
92 static void ReverseAltSortOrder(SYMBOL_TYPE * data, unsigned int nBytes);
93
94
95 private:
96 int CompareStrings(SYMBOL_TYPE * stringA, SYMBOL_TYPE * stringB, int len);
97
98 bool IsTandemRepeat2();
99
100 bool IsTandemRepeat();
101
102 void PassTandemRepeat();
103
104 bool IsSortedByInduction();
105
106 bool IsSortedByEnhancedInduction(unsigned int suffixIndex);
107
108 void ProcessSuffixesSortedByInduction();
109
110 // MarkSuffixAsSorted
111 // Sets the final inverse suffix array value for a given suffix.
112 // Also invokes the OnSortedSuffix member function.
113 void MarkSuffixAsSorted(unsigned int suffixIndex, unsigned int & sortedIndex);
114 void MarkSuffixAsSorted2(unsigned int suffixIndex, unsigned int & sortedIndex);
115
116 void MarkSuffixAsSortedByEnhancedInductionSort(unsigned int suffixIndex);
117
118 // PushNewChainsOntoStack:
119 // Moves all new suffix chains onto the stack of partially sorted
120 // suffixes. (makes them ready for further sub sorting).
121 void PushNewChainsOntoStack(bool originalChains = false);
122
123 void PushTandemBypassesOntoStack();
124
125 // OnSortedSuffix:
126 // Event which is invoked with each sorted suffix at the time of
127 // its sorting.
128 virtual void OnSortedSuffix(unsigned int suffixIndex);
129
130 // Initialize:
131 // Initializes this object just before sorting begins.
132 void Initialize();
133
134 // InitialSort:
135 // This is the first sorting pass which makes the initial suffix
136 // chains from the given source string. Pushes these chains onto
137 // the stack for further sorting.
138 void InitialSort();
139
140 // Value16:
141 // Returns the two 8 bit symbols located
142 // at positions N and N + 1 where N = the sourceIndex.
143 unsigned short Value16(unsigned int sourceIndex);
144
145 // ProcessChain:
146 // Sorts the suffixes of a given chain by the next two symbols of
147 // each suffix in the chain. This creates zero or more new suffix
148 // chains with each sorted by two more symbols than the original
149 // chain. Then pushes these new chains onto the chain stack for
150 // further sorting.
151 void ProcessNextChain();
152
153 void AddToSuffixChain(unsigned int suffixIndex, unsigned short suffixChain);
154
155 void AddToSuffixChain(unsigned int firstSuffixIndex, unsigned int lastSuffixIndex, unsigned short suffixChain);
156
157 void ProcessSuffixesSortedByEnhancedInduction(unsigned short suffixId);
158
159 void ResolveTandemRepeatsNotSortedWithInduction();
160
161 unsigned int m_sortTime;
162
163 Stack<unsigned int> m_chainMatchLengthStack;
164
165 Stack<int> m_chainCountStack;
166
167 Stack<unsigned int> m_chainHeadStack;
168
169 unsigned int m_endOfSuffixChain[0x10000];
170
171 unsigned int m_startOfSuffixChain[0x10000];
172
173 // m_source:
174 // Address of the string to sort.
175 SYMBOL_TYPE * m_source;
176
177 // m_sourceLength:
178 // The length of the string pointed to by m_source.
179 unsigned int m_sourceLength;
180
181 unsigned int m_sourceLengthMinusOne;
182
183 // m_ISA:
184 // The address of the working space which, when the sort is
185 // completed, will contain the inverse suffix array for the
186 // source string.
187 unsigned int * m_ISA;
188
189 // m_nextSortedSuffixValue:
190 unsigned int m_nextSortedSuffixValue;
191
192 //
193 unsigned int m_numSortedSuffixes;
194
195 // m_newChainIds
196 // Array containing the valid chain numbers in m_newChain array.
197 unsigned short m_newChainIds[0x10000];
198
199 // m_numNewChains:
200 // The number of new suffix chain ids stored in m_numChainIds.
201 unsigned int m_numNewChains;
202
203 Stack<InductionSortObject> m_suffixesSortedByInduction;
204
205 unsigned int m_suffixMatchLength;
206
207 unsigned int m_currentSuffixIndex;
208
209 // m_firstSortedPosition:
210 // For use with enhanced induction sorting.
211 unsigned int m_firstSortedPosition[0x10000];
212
213 unsigned int m_firstSuffixByEnhancedInductionSort[0x10000];
214
215 unsigned int m_lastSuffixByEnhancedInductionSort[0x10000];
216
217 unsigned int m_currentSuffixChainId;
218
219 #ifdef SHOW_PROGRESS
220 // ShowProgress:
221 // Update the progress indicator.
222 void ShowProgress();
223
224 // m_nextProgressUpdate:
225 // Indicates when to update the progress indicator.
226 unsigned int m_nextProgressUpdate;
227
228 // m_progressUpdateIncrement:
229 // Indicates how many suffixes should be sorted before
230 // incrementing the progress indicator.
231 unsigned int m_progressUpdateIncrement;
232 #endif
233
234
235 // members used if alternate sorting order should be applied.
236 SYMBOL_TYPE m_forwardAltSortOrder[256];
237
238 static SYMBOL_TYPE m_reverseAltSortOrder[256];
239
240 // for tandem repeat sorting
241 bool m_hasTandemRepeatSortedByInduction;
242
243 unsigned int m_firstUnsortedTandemRepeat;
244
245 unsigned int m_lastUnsortedTandemRepeat;
246
247 bool m_hasEvenLengthTandemRepeats;
248
249 unsigned int m_tandemRepeatDepth;
250
251 unsigned int m_firstSortedTandemRepeat;
252
253 unsigned int m_lastSortedTandemRepeat;
254
255 unsigned int m_tandemRepeatLength;
256 };
257
258
259
260
261
262 //inline unsigned short MSufSort::Value16(unsigned int sourceIndex)
263 //{
264 // return (sourceIndex < m_sourceLengthMinusOne) ? *(unsigned short *)(m_source + sourceIndex) : m_source[sourceIndex];
265 //}
266
267 // fix by Brian Ripley
Value16(unsigned int sourceIndex)268 inline unsigned short MSufSort::Value16(unsigned int sourceIndex)
269 {
270 union {unsigned short u; unsigned char b[2];} u16;
271 u16.b[0] = m_source[sourceIndex];
272 u16.b[1] = (sourceIndex < m_sourceLengthMinusOne) ?
273 m_source[sourceIndex + 1] : 0;
274 return u16.u;
275 }
276
277
278
279
280
IsSortedByInduction()281 inline bool MSufSort::IsSortedByInduction()
282 {
283 unsigned int n = m_currentSuffixIndex + m_suffixMatchLength - 1;
284
285 #ifndef USE_INDUCTION_SORTING
286 if (n < m_sourceLengthMinusOne)
287 return false;
288 #endif
289
290 if ((m_ISA[n] & SUFFIX_SORTED) && ((m_ISA[n] & 0x3fffffff) < m_nextSortedSuffixValue))
291 {
292 InductionSortObject i(0, m_ISA[n], m_currentSuffixIndex);
293 m_suffixesSortedByInduction.Push(i);
294 }
295 else
296 if ((m_ISA[n + 1] & SUFFIX_SORTED) && ((m_ISA[n + 1] & 0x3fffffff) < m_nextSortedSuffixValue))
297 {
298 InductionSortObject i(1, m_ISA[n + 1], m_currentSuffixIndex);
299 m_suffixesSortedByInduction.Push(i);
300 }
301 else
302 return false;
303
304 return true;
305 }
306
307
308
309
310
311
312
313
IsSortedByEnhancedInduction(unsigned int suffixIndex)314 inline bool MSufSort::IsSortedByEnhancedInduction(unsigned int suffixIndex)
315 {
316 if (suffixIndex > 0)
317 if (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION)
318 return true;
319 return false;
320 }
321
322
323
324
325
326
IsTandemRepeat()327 inline bool MSufSort::IsTandemRepeat()
328 {
329 #ifndef USE_TANDEM_REPEAT_SORTING
330 return false;
331 #else
332 if ((!m_tandemRepeatDepth) && (m_currentSuffixIndex + m_suffixMatchLength) == (m_ISA[m_currentSuffixIndex] + 1))
333 return true;
334
335 #ifndef SORT_16_BIT_SYMBOLS
336 if ((!m_tandemRepeatDepth) && ((m_currentSuffixIndex + m_suffixMatchLength) == (m_ISA[m_currentSuffixIndex])))
337 {
338 m_hasEvenLengthTandemRepeats = true;
339 return false;
340 }
341 #endif
342
343 return false;
344 #endif
345 }
346
347
348
349
350
351
352
PassTandemRepeat()353 inline void MSufSort::PassTandemRepeat()
354 {
355 unsigned int nextIndex;
356 unsigned int lastIndex;
357 // unsigned int firstIndex = m_currentSuffixIndex;
358
359 while ((m_currentSuffixIndex + m_suffixMatchLength) == ((nextIndex = m_ISA[m_currentSuffixIndex]) + 1))
360 {
361 lastIndex = m_currentSuffixIndex;
362 m_currentSuffixIndex = nextIndex;
363 }
364
365 if (IsSortedByInduction())
366 {
367 m_hasTandemRepeatSortedByInduction = true;
368 m_currentSuffixIndex = m_ISA[m_currentSuffixIndex];
369 }
370 else
371 {
372 if (m_firstUnsortedTandemRepeat == END_OF_CHAIN)
373 m_firstUnsortedTandemRepeat = m_lastUnsortedTandemRepeat = lastIndex;
374 else
375 m_lastUnsortedTandemRepeat = (m_ISA[m_lastUnsortedTandemRepeat] = lastIndex);
376 }
377 }
378
379
380
381
382
383
PushNewChainsOntoStack(bool originalChains)384 inline void MSufSort::PushNewChainsOntoStack(bool originalChains)
385 {
386 // Moves all new suffix chains onto the stack of partially sorted
387 // suffixes. (makes them ready for further sub sorting).
388 #ifdef SORT_16_BIT_SYMBOLS
389 unsigned int newSuffixMatchLength = m_suffixMatchLength + 1;
390 #else
391 unsigned int newSuffixMatchLength = m_suffixMatchLength + 2;
392 #endif
393
394 if (m_numNewChains)
395 {
396 if (m_hasEvenLengthTandemRepeats)
397 {
398 m_chainCountStack.Push(m_numNewChains - 1);
399 m_chainMatchLengthStack.Push(newSuffixMatchLength);
400 m_chainCountStack.Push(1);
401 m_chainMatchLengthStack.Push(newSuffixMatchLength - 1);
402 }
403 else
404 {
405 m_chainCountStack.Push(m_numNewChains);
406 m_chainMatchLengthStack.Push(newSuffixMatchLength);
407 }
408
409 if (m_numNewChains > 1)
410 IntroSort(m_newChainIds, m_numNewChains);
411
412 while (m_numNewChains)
413 {
414 unsigned short chainId = m_newChainIds[--m_numNewChains];
415 chainId = ENDIAN_SWAP_16(chainId);
416 // unsigned int n = m_startOfSuffixChain[chainId];
417 m_chainHeadStack.Push(m_startOfSuffixChain[chainId]);
418 m_startOfSuffixChain[chainId] = END_OF_CHAIN;
419 m_ISA[m_endOfSuffixChain[chainId]] = END_OF_CHAIN;
420 }
421 }
422 m_hasEvenLengthTandemRepeats = false;
423
424 if (m_firstUnsortedTandemRepeat != END_OF_CHAIN)
425 {
426 // Tandem repeats with a terminating suffix that did not get
427 // sorted via induction has occurred (at least once).
428 // We have a suffix chain (indicated by m_firstTandemRepeatWithoutSuffix)
429 // of the suffix in each tandem repeat which immediately proceeded the
430 // terminating suffix in each chain. We want to sort them relative to
431 // each other and then process the tandem repeats.
432 unsigned int tandemRepeatLength = m_suffixMatchLength - 1;
433 unsigned int numChains = m_chainHeadStack.Count();
434 m_chainHeadStack.Push(m_firstUnsortedTandemRepeat);
435 m_chainCountStack.Push(1);
436 m_chainMatchLengthStack.Push((m_suffixMatchLength << 1) - 1);
437 m_ISA[m_lastUnsortedTandemRepeat] = END_OF_CHAIN;
438 m_firstUnsortedTandemRepeat = END_OF_CHAIN;
439
440 m_tandemRepeatDepth = 1;
441 while (m_chainHeadStack.Count() > numChains)
442 ProcessNextChain();
443
444 m_suffixMatchLength = tandemRepeatLength + 1;
445 ResolveTandemRepeatsNotSortedWithInduction();
446 m_tandemRepeatDepth = 0;
447 }
448
449 }
450
451
452
453
454
455
AddToSuffixChain(unsigned int suffixIndex,unsigned short suffixChain)456 inline void MSufSort::AddToSuffixChain(unsigned int suffixIndex, unsigned short suffixChain)
457 {
458 if (m_startOfSuffixChain[suffixChain] == END_OF_CHAIN)
459 {
460 m_endOfSuffixChain[suffixChain] = m_startOfSuffixChain[suffixChain] = suffixIndex;
461 m_newChainIds[m_numNewChains++] = ENDIAN_SWAP_16(suffixChain);
462 }
463 else
464 m_endOfSuffixChain[suffixChain] = m_ISA[m_endOfSuffixChain[suffixChain]] = suffixIndex;
465 }
466
467
468
469
470
471
AddToSuffixChain(unsigned int firstSuffixIndex,unsigned int lastSuffixIndex,unsigned short suffixChain)472 inline void MSufSort::AddToSuffixChain(unsigned int firstSuffixIndex, unsigned int lastSuffixIndex, unsigned short suffixChain)
473 {
474 if (m_startOfSuffixChain[suffixChain] == END_OF_CHAIN)
475 {
476 m_startOfSuffixChain[suffixChain] = firstSuffixIndex;
477 m_endOfSuffixChain[suffixChain] = lastSuffixIndex;
478 m_newChainIds[m_numNewChains++] = ENDIAN_SWAP_16(suffixChain);
479 }
480 else
481 {
482 m_ISA[m_endOfSuffixChain[suffixChain]] = firstSuffixIndex;
483 m_endOfSuffixChain[suffixChain] = lastSuffixIndex;
484 }
485 }
486
487
488
489
490
491
492
OnSortedSuffix(unsigned int suffixIndex)493 inline void MSufSort::OnSortedSuffix(unsigned int suffixIndex)
494 {
495 // Event which is invoked with each sorted suffix at the time of
496 // its sorting.
497 m_numSortedSuffixes++;
498 #ifdef SHOW_PROGRESS
499 if (m_numSortedSuffixes >= m_nextProgressUpdate)
500 {
501 m_nextProgressUpdate += m_progressUpdateIncrement;
502 ShowProgress();
503 }
504 #endif
505 }
506
507
508
509 #ifdef SORT_16_BIT_SYMBOLS
510
MarkSuffixAsSorted(unsigned int suffixIndex,unsigned int & sortedIndex)511 inline void MSufSort::MarkSuffixAsSorted(unsigned int suffixIndex, unsigned int & sortedIndex)
512 {
513 // Sets the final inverse suffix array value for a given suffix.
514 // Also invokes the OnSortedSuffix member function.
515
516 if (m_tandemRepeatDepth)
517 {
518 // we are processing a list of suffixes which we the second to last in tandem repeats
519 // that were not terminated via induction. These suffixes are not actually to be
520 // marked as sorted yet. Instead, they are to be linked together in sorted order.
521 if (m_firstSortedTandemRepeat == END_OF_CHAIN)
522 m_firstSortedTandemRepeat = m_lastSortedTandemRepeat = suffixIndex;
523 else
524 m_lastSortedTandemRepeat = (m_ISA[m_lastSortedTandemRepeat] = suffixIndex);
525 return;
526 }
527
528 m_ISA[suffixIndex] = (sortedIndex++ | SUFFIX_SORTED);
529 #ifdef SHOW_PROGRESS
530 OnSortedSuffix(suffixIndex);
531 #endif
532
533 #ifdef USE_ENHANCED_INDUCTION_SORTING
534 if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
535 {
536 suffixIndex--;
537 unsigned short symbol = Value16(suffixIndex);
538
539 m_ISA[suffixIndex] = (m_firstSortedPosition[symbol]++ | SUFFIX_SORTED);
540 #ifdef SHOW_PROGRESS
541 OnSortedSuffix(suffixIndex);
542 #endif
543
544 if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
545 {
546 suffixIndex--;
547 symbol = ENDIAN_SWAP_16(symbol);
548 if (m_firstSuffixByEnhancedInductionSort[symbol] == END_OF_CHAIN)
549 m_firstSuffixByEnhancedInductionSort[symbol] = m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
550 else
551 {
552 m_ISA[m_lastSuffixByEnhancedInductionSort[symbol]] = suffixIndex;
553 m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
554 }
555 }
556 }
557 #endif
558 }
559
560
561
562
MarkSuffixAsSorted2(unsigned int suffixIndex,unsigned int & sortedIndex)563 inline void MSufSort::MarkSuffixAsSorted2(unsigned int suffixIndex, unsigned int & sortedIndex)
564 {
565 // Sets the final inverse suffix array value for a given suffix.
566 // Also invokes the OnSortedSuffix member function.
567
568 if (m_tandemRepeatDepth)
569 {
570 // we are processing a list of suffixes which we the second to last in tandem repeats
571 // that were not terminated via induction. These suffixes are not actually to be
572 // marked as sorted yet. Instead, they are to be linked together in sorted order.
573 if (m_firstSortedTandemRepeat == END_OF_CHAIN)
574 m_firstSortedTandemRepeat = m_lastSortedTandemRepeat = suffixIndex;
575 else
576 m_lastSortedTandemRepeat = (m_ISA[m_lastSortedTandemRepeat] = suffixIndex);
577 return;
578 }
579
580 m_ISA[suffixIndex] = (sortedIndex++ | SUFFIX_SORTED);
581 #ifdef SHOW_PROGRESS
582 OnSortedSuffix(suffixIndex);
583 #endif
584
585 #ifdef USE_ENHANCED_INDUCTION_SORTING
586 if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
587 {
588 unsigned short symbol = Value16(suffixIndex);
589 symbol = ENDIAN_SWAP_16(symbol);
590 suffixIndex--;
591 if (m_firstSuffixByEnhancedInductionSort[symbol] == END_OF_CHAIN)
592 m_firstSuffixByEnhancedInductionSort[symbol] = m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
593 else
594 {
595 m_ISA[m_lastSuffixByEnhancedInductionSort[symbol]] = suffixIndex;
596 m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
597 }
598 }
599 #endif
600 }
601
602
603 #else
604
605
MarkSuffixAsSorted(unsigned int suffixIndex,unsigned int & sortedIndex)606 inline void MSufSort::MarkSuffixAsSorted(unsigned int suffixIndex, unsigned int & sortedIndex)
607 {
608 // Sets the final inverse suffix array value for a given suffix.
609 // Also invokes the OnSortedSuffix member function.
610
611 if (m_tandemRepeatDepth)
612 {
613 // we are processing a list of suffixes which we the second to last in tandem repeats
614 // that were not terminated via induction. These suffixes are not actually to be
615 // marked as sorted yet. Instead, they are to be linked together in sorted order.
616 if (m_firstSortedTandemRepeat == END_OF_CHAIN)
617 m_firstSortedTandemRepeat = m_lastSortedTandemRepeat = suffixIndex;
618 else
619 m_lastSortedTandemRepeat = (m_ISA[m_lastSortedTandemRepeat] = suffixIndex);
620 return;
621 }
622
623 m_ISA[suffixIndex] = (sortedIndex++ | SUFFIX_SORTED);
624 #ifdef SHOW_PROGRESS
625 OnSortedSuffix(suffixIndex);
626 #endif
627
628 #ifdef USE_ENHANCED_INDUCTION_SORTING
629 if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
630 {
631 suffixIndex--;
632 unsigned short symbol = Value16(suffixIndex);
633
634 m_ISA[suffixIndex] = (m_firstSortedPosition[symbol]++ | SUFFIX_SORTED);
635 #ifdef SHOW_PROGRESS
636 OnSortedSuffix(suffixIndex);
637 #endif
638
639 if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
640 {
641 suffixIndex--;
642 unsigned short symbol2 = symbol;
643 symbol = Value16(suffixIndex);
644
645 m_ISA[suffixIndex] = (m_firstSortedPosition[symbol]++ | SUFFIX_SORTED);
646 #ifdef SHOW_PROGRESS
647 OnSortedSuffix(suffixIndex);
648 #endif
649
650 if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
651 {
652 if (m_source[suffixIndex] < m_source[suffixIndex + 1])
653 symbol2 = ENDIAN_SWAP_16(symbol);
654 else
655 symbol2 = ENDIAN_SWAP_16(symbol2);
656 suffixIndex--;
657 if (m_firstSuffixByEnhancedInductionSort[symbol2] == END_OF_CHAIN)
658 m_firstSuffixByEnhancedInductionSort[symbol2] = m_lastSuffixByEnhancedInductionSort[symbol2] = suffixIndex;
659 else
660 {
661 m_ISA[m_lastSuffixByEnhancedInductionSort[symbol2]] = suffixIndex;
662 m_lastSuffixByEnhancedInductionSort[symbol2] = suffixIndex;
663 }
664 }
665 }
666 }
667 #endif
668 }
669
670
671
672
MarkSuffixAsSorted2(unsigned int suffixIndex,unsigned int & sortedIndex)673 inline void MSufSort::MarkSuffixAsSorted2(unsigned int suffixIndex, unsigned int & sortedIndex)
674 {
675 // Sets the final inverse suffix array value for a given suffix.
676 // Also invokes the OnSortedSuffix member function.
677
678 if (m_tandemRepeatDepth)
679 {
680 // we are processing a list of suffixes which we the second to last in tandem repeats
681 // that were not terminated via induction. These suffixes are not actually to be
682 // marked as sorted yet. Instead, they are to be linked together in sorted order.
683 if (m_firstSortedTandemRepeat == END_OF_CHAIN)
684 m_firstSortedTandemRepeat = m_lastSortedTandemRepeat = suffixIndex;
685 else
686 m_lastSortedTandemRepeat = (m_ISA[m_lastSortedTandemRepeat] = suffixIndex);
687 return;
688 }
689
690 m_ISA[suffixIndex] = (sortedIndex++ | SUFFIX_SORTED);
691 #ifdef SHOW_PROGRESS
692 OnSortedSuffix(suffixIndex);
693 #endif
694
695 #ifdef USE_ENHANCED_INDUCTION_SORTING
696 if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
697 {
698 unsigned short symbol;
699 if (m_source[suffixIndex] < m_source[suffixIndex + 1])
700 symbol = Value16(suffixIndex);
701 else
702 symbol = Value16(suffixIndex + 1);
703 symbol = ENDIAN_SWAP_16(symbol);
704 suffixIndex--;
705 if (m_firstSuffixByEnhancedInductionSort[symbol] == END_OF_CHAIN)
706 m_firstSuffixByEnhancedInductionSort[symbol] = m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
707 else
708 {
709 m_ISA[m_lastSuffixByEnhancedInductionSort[symbol]] = suffixIndex;
710 m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
711 }
712 }
713 #endif
714 }
715
716 #endif
717
718
ProcessNextChain()719 inline void MSufSort::ProcessNextChain()
720 {
721 // Sorts the suffixes of a given chain by the next two symbols of
722 // each suffix in the chain. This creates zero or more new suffix
723 // chains with each sorted by two more symbols than the original
724 // chain. Then pushes these new chains onto the chain stack for
725 // further sorting.
726 while (--m_chainCountStack.Top() < 0)
727 {
728 m_chainCountStack.Pop();
729 m_chainMatchLengthStack.Pop();
730 }
731 m_suffixMatchLength = m_chainMatchLengthStack.Top();
732 m_currentSuffixIndex = m_chainHeadStack.Pop();
733
734 #ifdef USE_ENHANCED_INDUCTION_SORTING
735 if (m_chainMatchLengthStack.Count() == 1)
736 {
737 // one of the original buckets from InitialSort(). This is important
738 // when enhanced induction sorting is enabled.
739 unsigned short chainId = Value16(m_currentSuffixIndex);
740 unsigned short temp = chainId;
741 chainId = ENDIAN_SWAP_16(chainId);
742 while (m_currentSuffixChainId <= chainId)
743 ProcessSuffixesSortedByEnhancedInduction(m_currentSuffixChainId++);
744 m_nextSortedSuffixValue = m_firstSortedPosition[temp];
745 }
746 #endif
747
748 if (m_ISA[m_currentSuffixIndex] == END_OF_CHAIN)
749 MarkSuffixAsSorted(m_currentSuffixIndex, m_nextSortedSuffixValue); // only one suffix in bucket so it is sorted.
750 else
751 {
752 do
753 {
754 if (IsTandemRepeat())
755 PassTandemRepeat();
756 else
757 if ((m_currentSuffixIndex != END_OF_CHAIN) && (IsSortedByInduction()))
758 m_currentSuffixIndex = m_ISA[m_currentSuffixIndex];
759 else
760 {
761 unsigned int firstSuffixIndex = m_currentSuffixIndex;
762 unsigned int lastSuffixIndex = m_currentSuffixIndex;
763 unsigned short targetSymbol = Value16(m_currentSuffixIndex + m_suffixMatchLength);
764 unsigned int nextSuffix;
765
766 do
767 {
768 nextSuffix = m_ISA[lastSuffixIndex = m_currentSuffixIndex];
769 if ((m_currentSuffixIndex = nextSuffix) == END_OF_CHAIN)
770 break;
771 else
772 if (IsTandemRepeat())
773 {
774 PassTandemRepeat();
775 break;
776 }
777 else
778 if (IsSortedByInduction())
779 {
780 m_currentSuffixIndex = m_ISA[nextSuffix];
781 break;
782 }
783 } while (Value16(m_currentSuffixIndex + m_suffixMatchLength) == targetSymbol);
784
785 AddToSuffixChain(firstSuffixIndex, lastSuffixIndex, targetSymbol);
786 }
787 } while (m_currentSuffixIndex != END_OF_CHAIN);
788
789 ProcessSuffixesSortedByInduction();
790 PushNewChainsOntoStack();
791 }
792 }
793
794
795
796
797
798
ProcessSuffixesSortedByInduction()799 inline void MSufSort::ProcessSuffixesSortedByInduction()
800 {
801 unsigned int numSuffixes = m_suffixesSortedByInduction.Count();
802 if (numSuffixes)
803 {
804 InductionSortObject * objects = m_suffixesSortedByInduction.m_stack;
805 if (numSuffixes > 1)
806 IntroSort(objects, numSuffixes);
807
808 if (m_hasTandemRepeatSortedByInduction)
809 {
810 // During the last pass some suffixes which were sorted via induction were also
811 // determined to be the terminal suffix in a tandem repeat. So when we mark
812 // the suffixes as sorted (where were sorted via induction) we make chain together
813 // the preceding suffix in the tandem repeat (if there is one).
814 unsigned int firstTandemRepeatIndex = END_OF_CHAIN;
815 unsigned int lastTandemRepeatIndex = END_OF_CHAIN;
816 unsigned int tandemRepeatLength = m_suffixMatchLength - 1;
817 m_hasTandemRepeatSortedByInduction = false;
818
819 for (unsigned int i = 0; i < numSuffixes; i++)
820 {
821 unsigned int suffixIndex = (objects[i].m_sortValue[1] & 0x3fffffff);
822 if ((suffixIndex >= tandemRepeatLength) && (m_ISA[suffixIndex - tandemRepeatLength] == suffixIndex))
823 {
824 // this suffix was a terminating suffix in a tandem repeat.
825 // add the preceding suffix in the tandem repeat to the list.
826 if (firstTandemRepeatIndex == END_OF_CHAIN)
827 firstTandemRepeatIndex = lastTandemRepeatIndex = (suffixIndex - tandemRepeatLength);
828 else
829 lastTandemRepeatIndex = (m_ISA[lastTandemRepeatIndex] = (suffixIndex - tandemRepeatLength));
830 }
831 MarkSuffixAsSorted(suffixIndex, m_nextSortedSuffixValue);
832 }
833
834 // now process each suffix in the tandem repeat list making each as sorted.
835 // build a new list for tandem repeats which preceded each in the list until there are
836 // no preceding tandem suffix for any suffix in the list.
837
838 while (firstTandemRepeatIndex != END_OF_CHAIN)
839 {
840 m_ISA[lastTandemRepeatIndex] = END_OF_CHAIN;
841 unsigned int suffixIndex = firstTandemRepeatIndex;
842 firstTandemRepeatIndex = END_OF_CHAIN;
843 while (suffixIndex != END_OF_CHAIN)
844 {
845 if ((suffixIndex >= tandemRepeatLength) && (m_ISA[suffixIndex - tandemRepeatLength] == suffixIndex))
846 {
847 // this suffix was a terminating suffix in a tandem repeat.
848 // add the preceding suffix in the tandem repeat to the list.
849 if (firstTandemRepeatIndex == END_OF_CHAIN)
850 firstTandemRepeatIndex = lastTandemRepeatIndex = (suffixIndex - tandemRepeatLength);
851 else
852 lastTandemRepeatIndex = (m_ISA[lastTandemRepeatIndex] = (suffixIndex - tandemRepeatLength));
853 }
854 unsigned int nextSuffix = m_ISA[suffixIndex];
855 MarkSuffixAsSorted(suffixIndex, m_nextSortedSuffixValue);
856 suffixIndex = nextSuffix;
857 }
858 }
859 // finished.
860 }
861 else
862 {
863 // This is the typical branch on the condition. There were no tandem repeats
864 // encountered during the last chain that were terminated with a suffix that
865 // was sorted via induction. In this case we just mark the suffixes as sorted
866 // and we are done.
867 for (unsigned int i = 0; i < numSuffixes; i++)
868 MarkSuffixAsSorted(objects[i].m_sortValue[1] & 0x3fffffff, m_nextSortedSuffixValue);
869 }
870 m_suffixesSortedByInduction.Clear();
871 }
872 }
873
874
875
876
877
ProcessSuffixesSortedByEnhancedInduction(unsigned short suffixId)878 inline void MSufSort::ProcessSuffixesSortedByEnhancedInduction(unsigned short suffixId)
879 {
880 //
881 if (m_firstSuffixByEnhancedInductionSort[suffixId] != END_OF_CHAIN)
882 {
883 unsigned int currentSuffixIndex = m_firstSuffixByEnhancedInductionSort[suffixId];
884 unsigned int lastSuffixIndex = m_lastSuffixByEnhancedInductionSort[suffixId];
885 m_firstSuffixByEnhancedInductionSort[suffixId] = END_OF_CHAIN;
886 m_lastSuffixByEnhancedInductionSort[suffixId] = END_OF_CHAIN;
887
888 do
889 {
890 unsigned short symbol = Value16(currentSuffixIndex);
891 unsigned int nextIndex = m_ISA[currentSuffixIndex];
892 MarkSuffixAsSorted2(currentSuffixIndex, m_firstSortedPosition[symbol]);
893 if (currentSuffixIndex == lastSuffixIndex)
894 {
895 if (m_firstSuffixByEnhancedInductionSort[suffixId] == END_OF_CHAIN)
896 return;
897 currentSuffixIndex = m_firstSuffixByEnhancedInductionSort[suffixId];
898 lastSuffixIndex = m_lastSuffixByEnhancedInductionSort[suffixId];
899 m_firstSuffixByEnhancedInductionSort[suffixId] = END_OF_CHAIN;
900 m_lastSuffixByEnhancedInductionSort[suffixId] = END_OF_CHAIN;
901 }
902 else
903 currentSuffixIndex = nextIndex;
904 } while (true);
905 }
906 }
907
908
909
910
911 #ifdef SHOW_PROGRESS
ShowProgress()912 inline void MSufSort::ShowProgress()
913 {
914 // Update the progress indicator.
915 //double p = ((double)(m_numSortedSuffixes & 0x3fffffff) / m_sourceLength) * 100;
916 // printf("Progress: %.2f%% %c", p, 13);
917 }
918 #endif
919 #endif
920