1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 2.0
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7  *
8  * Software distributed under the License is distributed on an "AS IS" basis,
9  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
10  * for the specific language governing rights and limitations under the
11  * License.
12  *
13  * The Original Code is the MSufSort suffix sorting algorithm (Version 2.2).
14  *
15  * The Initial Developer of the Original Code is
16  * Michael A. Maniscalco
17  * Portions created by the Initial Developer are Copyright (C) 2006
18  * the Initial Developer. All Rights Reserved.
19  *
20  * Contributor(s):
21  *
22  *   Michael A. Maniscalco
23  *
24  * ***** END LICENSE BLOCK ***** */
25 
26 #ifndef MSUFSORT_H
27 #define MSUFSORT_H
28 
29 //==================================================================//
30 //																	//
31 //  v										//
32 // MSufSort	Version 2.2																		//
33 // Author: Michael A Maniscalco										//
34 // Date: Nov. 3, 2005												//
35 //																	//
36 // Notes:															//
37 //																	//
38 //==================================================================//
39 
40 
41 #include "stdio.h"
42 #include "stack.h"
43 #include "introsort.h"
44 #include "inductionsort.h"
45 
46 
47 //==================================================================//
48 // Test app defines:												//
49 //==================================================================//
50 
51 #define SHOW_PROGRESS		        // display progress during sort
52 #define CHECK_SORT		       	// verify that sorting is correct.
53 //  #define SORT_16_BIT_SYMBOLS		// enable 16 bit symbols.
54 
55 #define USE_INDUCTION_SORTING		// enable induction sorting feature.
56 #define USE_ENHANCED_INDUCTION_SORTING	// enable enhanced induction sorting feature.
57 #define USE_TANDEM_REPEAT_SORTING   // enable the tandem repeat sorting feature.
58 //#define USE_ALT_SORT_ORDER	    // enable alternative sorting order
59 
60 
61 #define ENDIAN_SWAP_16(value)	    ((value >> 8) | (value << 8))
62 
63 #define SUFFIX_SORTED		     0x80000000	// flag marks suffix as sorted.
64 #define END_OF_CHAIN		     0x3ffffffe	// marks the end of a chain
65 #define SORTED_BY_ENHANCED_INDUCTION 0x3fffffff	// marks suffix which will be sorted by enhanced induction sort.
66 
67 
68 
69 #ifdef SORT_16_BIT_SYMBOLS
70 	#define SYMBOL_TYPE	     unsigned short
71 #else
72 	#define SYMBOL_TYPE	     unsigned char
73 #endif
74 
75 class MSufSort
76 {
77 public:
78 	MSufSort();
79 
80 	virtual ~MSufSort();
81 
82 	unsigned int Sort(SYMBOL_TYPE * source, unsigned int sourceLength);
83 
84 	unsigned int GetElapsedSortTime();
85 
86 	unsigned int GetMemoryUsage();
87 
88 	unsigned int ISA(unsigned int index);
89 
90 	bool VerifySort();
91 
92 	static void ReverseAltSortOrder(SYMBOL_TYPE * data, unsigned int nBytes);
93 
94 
95 private:
96 	int CompareStrings(SYMBOL_TYPE * stringA, SYMBOL_TYPE * stringB, int len);
97 
98 	bool IsTandemRepeat2();
99 
100 	bool IsTandemRepeat();
101 
102 	void PassTandemRepeat();
103 
104 	bool IsSortedByInduction();
105 
106 	bool IsSortedByEnhancedInduction(unsigned int suffixIndex);
107 
108 	void ProcessSuffixesSortedByInduction();
109 
110 	// MarkSuffixAsSorted
111 	// Sets the final inverse suffix array value for a given suffix.
112 	// Also invokes the OnSortedSuffix member function.
113 	void MarkSuffixAsSorted(unsigned int suffixIndex, unsigned int & sortedIndex);
114 	void MarkSuffixAsSorted2(unsigned int suffixIndex, unsigned int & sortedIndex);
115 
116 	void MarkSuffixAsSortedByEnhancedInductionSort(unsigned int suffixIndex);
117 
118 	// PushNewChainsOntoStack:
119 	// Moves all new suffix chains onto the stack of partially sorted
120 	// suffixes.  (makes them ready for further sub sorting).
121 	void PushNewChainsOntoStack(bool originalChains = false);
122 
123 	void PushTandemBypassesOntoStack();
124 
125 	// OnSortedSuffix:
126 	// Event which is invoked with each sorted suffix at the time of
127 	// its sorting.
128 	virtual void OnSortedSuffix(unsigned int suffixIndex);
129 
130 	// Initialize:
131 	// Initializes this object just before sorting begins.
132 	void Initialize();
133 
134 	// InitialSort:
135 	// This is the first sorting pass which makes the initial suffix
136 	// chains from the given source string.  Pushes these chains onto
137 	// the stack for further sorting.
138 	void InitialSort();
139 
140 	// Value16:
141 	// Returns the two 8 bit symbols located
142 	// at positions N and N + 1 where N = the sourceIndex.
143 	unsigned short Value16(unsigned int sourceIndex);
144 
145 	// ProcessChain:
146 	// Sorts the suffixes of a given chain by the next two symbols of
147 	// each suffix in the chain.  This creates zero or more new suffix
148 	// chains with each sorted by two more symbols than the original
149 	// chain.  Then pushes these new chains onto the chain stack for
150 	// further sorting.
151 	void ProcessNextChain();
152 
153 	void AddToSuffixChain(unsigned int suffixIndex, unsigned short suffixChain);
154 
155 	void AddToSuffixChain(unsigned int firstSuffixIndex, unsigned int lastSuffixIndex, unsigned short suffixChain);
156 
157 	void ProcessSuffixesSortedByEnhancedInduction(unsigned short suffixId);
158 
159 	void ResolveTandemRepeatsNotSortedWithInduction();
160 
161 	unsigned int				m_sortTime;
162 
163 	Stack<unsigned int>			m_chainMatchLengthStack;
164 
165 	Stack<int>				m_chainCountStack;
166 
167 	Stack<unsigned int>			m_chainHeadStack;
168 
169 	unsigned int				m_endOfSuffixChain[0x10000];
170 
171 	unsigned int				m_startOfSuffixChain[0x10000];
172 
173 	// m_source:
174 	// Address of the string to sort.
175 	SYMBOL_TYPE *				m_source;
176 
177 	// m_sourceLength:
178 	// The length of the string pointed to by m_source.
179 	unsigned int				m_sourceLength;
180 
181 	unsigned int				m_sourceLengthMinusOne;
182 
183 	// m_ISA:
184 	// The address of the working space which, when the sort is
185 	// completed, will contain the inverse suffix array for the
186 	// source string.
187 	unsigned int *				m_ISA;
188 
189 	// m_nextSortedSuffixValue:
190 	unsigned int				m_nextSortedSuffixValue;
191 
192 	//
193 	unsigned int				m_numSortedSuffixes;
194 
195 	// m_newChainIds
196 	// Array containing the valid chain numbers in m_newChain array.
197 	unsigned short				m_newChainIds[0x10000];
198 
199 	// m_numNewChains:
200 	// The number of new suffix chain ids stored in m_numChainIds.
201 	unsigned int				m_numNewChains;
202 
203 	Stack<InductionSortObject>	m_suffixesSortedByInduction;
204 
205 	unsigned int				m_suffixMatchLength;
206 
207 	unsigned int				m_currentSuffixIndex;
208 
209 	// m_firstSortedPosition:
210 	// For use with enhanced induction sorting.
211 	unsigned int				m_firstSortedPosition[0x10000];
212 
213 	unsigned int				m_firstSuffixByEnhancedInductionSort[0x10000];
214 
215 	unsigned int				m_lastSuffixByEnhancedInductionSort[0x10000];
216 
217 	unsigned int				m_currentSuffixChainId;
218 
219 	#ifdef SHOW_PROGRESS
220 		// ShowProgress:
221 		// Update the progress indicator.
222 		void ShowProgress();
223 
224 		// m_nextProgressUpdate:
225 		// Indicates when to update the progress indicator.
226 		unsigned int			m_nextProgressUpdate;
227 
228 		// m_progressUpdateIncrement:
229 		// Indicates how many suffixes should be sorted before
230 		// incrementing the progress indicator.
231 		unsigned int			m_progressUpdateIncrement;
232 	#endif
233 
234 
235 	// members used if alternate sorting order should be applied.
236 	SYMBOL_TYPE		m_forwardAltSortOrder[256];
237 
238 	static SYMBOL_TYPE		m_reverseAltSortOrder[256];
239 
240 	// for tandem repeat sorting
241 	bool				m_hasTandemRepeatSortedByInduction;
242 
243 	unsigned int		m_firstUnsortedTandemRepeat;
244 
245 	unsigned int		m_lastUnsortedTandemRepeat;
246 
247 	bool				m_hasEvenLengthTandemRepeats;
248 
249 	unsigned int		m_tandemRepeatDepth;
250 
251 	unsigned int		m_firstSortedTandemRepeat;
252 
253 	unsigned int		m_lastSortedTandemRepeat;
254 
255 	unsigned int		m_tandemRepeatLength;
256 };
257 
258 
259 
260 
261 
262 //inline unsigned short MSufSort::Value16(unsigned int sourceIndex)
263 //{
264 //	return (sourceIndex < m_sourceLengthMinusOne) ? *(unsigned short *)(m_source + sourceIndex) : m_source[sourceIndex];
265 //}
266 
267 // fix by Brian Ripley
Value16(unsigned int sourceIndex)268 inline unsigned short MSufSort::Value16(unsigned int sourceIndex)
269 {
270    union {unsigned short u; unsigned char b[2];} u16;
271    u16.b[0] = m_source[sourceIndex];
272    u16.b[1] = (sourceIndex < m_sourceLengthMinusOne) ?
273 	m_source[sourceIndex + 1] : 0;
274    return u16.u;
275 }
276 
277 
278 
279 
280 
IsSortedByInduction()281 inline bool MSufSort::IsSortedByInduction()
282 {
283 	unsigned int n = m_currentSuffixIndex + m_suffixMatchLength - 1;
284 
285 	#ifndef USE_INDUCTION_SORTING
286 		if (n < m_sourceLengthMinusOne)
287 			return false;
288 	#endif
289 
290 	if ((m_ISA[n] & SUFFIX_SORTED) && ((m_ISA[n] & 0x3fffffff) < m_nextSortedSuffixValue))
291 	{
292 		InductionSortObject i(0, m_ISA[n], m_currentSuffixIndex);
293 		m_suffixesSortedByInduction.Push(i);
294 	}
295 	else
296 		if ((m_ISA[n + 1] & SUFFIX_SORTED) && ((m_ISA[n + 1] & 0x3fffffff) < m_nextSortedSuffixValue))
297 		{
298 			InductionSortObject i(1, m_ISA[n + 1], m_currentSuffixIndex);
299 			m_suffixesSortedByInduction.Push(i);
300 		}
301 		else
302 			return false;
303 
304 	return true;
305 }
306 
307 
308 
309 
310 
311 
312 
313 
IsSortedByEnhancedInduction(unsigned int suffixIndex)314 inline bool MSufSort::IsSortedByEnhancedInduction(unsigned int suffixIndex)
315 {
316 	if (suffixIndex > 0)
317 		if (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION)
318 			return true;
319 	return false;
320 }
321 
322 
323 
324 
325 
326 
IsTandemRepeat()327 inline bool MSufSort::IsTandemRepeat()
328 {
329 	#ifndef USE_TANDEM_REPEAT_SORTING
330 		return false;
331 	#else
332 		if ((!m_tandemRepeatDepth) && (m_currentSuffixIndex + m_suffixMatchLength) == (m_ISA[m_currentSuffixIndex] + 1))
333 			return true;
334 
335 		#ifndef SORT_16_BIT_SYMBOLS
336 			if ((!m_tandemRepeatDepth) && ((m_currentSuffixIndex + m_suffixMatchLength) == (m_ISA[m_currentSuffixIndex])))
337 			{
338 				m_hasEvenLengthTandemRepeats = true;
339 				return false;
340 			}
341 		#endif
342 
343 		return false;
344 	#endif
345 }
346 
347 
348 
349 
350 
351 
352 
PassTandemRepeat()353 inline void MSufSort::PassTandemRepeat()
354 {
355 	unsigned int nextIndex;
356 	unsigned int lastIndex;
357 	// unsigned int firstIndex = m_currentSuffixIndex;
358 
359 	while ((m_currentSuffixIndex + m_suffixMatchLength) == ((nextIndex = m_ISA[m_currentSuffixIndex]) + 1))
360 	{
361 		lastIndex = m_currentSuffixIndex;
362 		m_currentSuffixIndex = nextIndex;
363 	}
364 
365 	if (IsSortedByInduction())
366 	{
367 		m_hasTandemRepeatSortedByInduction = true;
368 		m_currentSuffixIndex = m_ISA[m_currentSuffixIndex];
369 	}
370 	else
371 	{
372 		if (m_firstUnsortedTandemRepeat == END_OF_CHAIN)
373 			m_firstUnsortedTandemRepeat = m_lastUnsortedTandemRepeat = lastIndex;
374 		else
375 			m_lastUnsortedTandemRepeat = (m_ISA[m_lastUnsortedTandemRepeat] = lastIndex);
376 	}
377 }
378 
379 
380 
381 
382 
383 
PushNewChainsOntoStack(bool originalChains)384 inline void MSufSort::PushNewChainsOntoStack(bool originalChains)
385 {
386 	// Moves all new suffix chains onto the stack of partially sorted
387 	// suffixes.  (makes them ready for further sub sorting).
388 	#ifdef SORT_16_BIT_SYMBOLS
389 		unsigned int newSuffixMatchLength = m_suffixMatchLength + 1;
390 	#else
391 		unsigned int newSuffixMatchLength = m_suffixMatchLength + 2;
392 	#endif
393 
394 	if (m_numNewChains)
395 	{
396 		if (m_hasEvenLengthTandemRepeats)
397 		{
398 			m_chainCountStack.Push(m_numNewChains - 1);
399 			m_chainMatchLengthStack.Push(newSuffixMatchLength);
400 			m_chainCountStack.Push(1);
401 			m_chainMatchLengthStack.Push(newSuffixMatchLength - 1);
402 		}
403 		else
404 		{
405 			m_chainCountStack.Push(m_numNewChains);
406 			m_chainMatchLengthStack.Push(newSuffixMatchLength);
407 		}
408 
409 		if (m_numNewChains > 1)
410 			IntroSort(m_newChainIds, m_numNewChains);
411 
412 		while (m_numNewChains)
413 		{
414 			unsigned short chainId = m_newChainIds[--m_numNewChains];
415 			chainId = ENDIAN_SWAP_16(chainId);
416 			// unsigned int n = m_startOfSuffixChain[chainId];
417 			m_chainHeadStack.Push(m_startOfSuffixChain[chainId]);
418 			m_startOfSuffixChain[chainId] = END_OF_CHAIN;
419 			m_ISA[m_endOfSuffixChain[chainId]] = END_OF_CHAIN;
420 		}
421 	}
422 	m_hasEvenLengthTandemRepeats = false;
423 
424 	if (m_firstUnsortedTandemRepeat != END_OF_CHAIN)
425 	{
426 		// Tandem repeats with a terminating suffix that did not get
427 		// sorted via induction has occurred (at least once).
428 		// We have a suffix chain (indicated by m_firstTandemRepeatWithoutSuffix)
429 		// of the suffix in each tandem repeat which immediately proceeded the
430 		// terminating suffix in each chain.  We want to sort them relative to
431 		// each other and then process the tandem repeats.
432 		unsigned int tandemRepeatLength = m_suffixMatchLength - 1;
433 		unsigned int numChains = m_chainHeadStack.Count();
434 		m_chainHeadStack.Push(m_firstUnsortedTandemRepeat);
435 		m_chainCountStack.Push(1);
436 		m_chainMatchLengthStack.Push((m_suffixMatchLength << 1) - 1);
437 		m_ISA[m_lastUnsortedTandemRepeat] = END_OF_CHAIN;
438 		m_firstUnsortedTandemRepeat = END_OF_CHAIN;
439 
440 		m_tandemRepeatDepth = 1;
441 		while (m_chainHeadStack.Count() > numChains)
442 			ProcessNextChain();
443 
444 		m_suffixMatchLength = tandemRepeatLength + 1;
445 		ResolveTandemRepeatsNotSortedWithInduction();
446 		m_tandemRepeatDepth = 0;
447 	}
448 
449 }
450 
451 
452 
453 
454 
455 
AddToSuffixChain(unsigned int suffixIndex,unsigned short suffixChain)456 inline void MSufSort::AddToSuffixChain(unsigned int suffixIndex, unsigned short suffixChain)
457 {
458 	if (m_startOfSuffixChain[suffixChain] == END_OF_CHAIN)
459 	{
460 		m_endOfSuffixChain[suffixChain] = m_startOfSuffixChain[suffixChain] = suffixIndex;
461 		m_newChainIds[m_numNewChains++] = ENDIAN_SWAP_16(suffixChain);
462 	}
463 	else
464 		m_endOfSuffixChain[suffixChain] = m_ISA[m_endOfSuffixChain[suffixChain]] = suffixIndex;
465 }
466 
467 
468 
469 
470 
471 
AddToSuffixChain(unsigned int firstSuffixIndex,unsigned int lastSuffixIndex,unsigned short suffixChain)472 inline void MSufSort::AddToSuffixChain(unsigned int firstSuffixIndex, unsigned int lastSuffixIndex, unsigned short suffixChain)
473 {
474 	if (m_startOfSuffixChain[suffixChain] == END_OF_CHAIN)
475 	{
476 		m_startOfSuffixChain[suffixChain] = firstSuffixIndex;
477 		m_endOfSuffixChain[suffixChain] = lastSuffixIndex;
478 		m_newChainIds[m_numNewChains++] = ENDIAN_SWAP_16(suffixChain);
479 	}
480 	else
481 	{
482 		m_ISA[m_endOfSuffixChain[suffixChain]] = firstSuffixIndex;
483 		m_endOfSuffixChain[suffixChain] = lastSuffixIndex;
484 	}
485 }
486 
487 
488 
489 
490 
491 
492 
OnSortedSuffix(unsigned int suffixIndex)493 inline void MSufSort::OnSortedSuffix(unsigned int suffixIndex)
494 {
495 	// Event which is invoked with each sorted suffix at the time of
496 	// its sorting.
497 	m_numSortedSuffixes++;
498 	#ifdef SHOW_PROGRESS
499 		if (m_numSortedSuffixes >= m_nextProgressUpdate)
500 		{
501 			m_nextProgressUpdate += m_progressUpdateIncrement;
502 			ShowProgress();
503 		}
504 	#endif
505 }
506 
507 
508 
509 #ifdef SORT_16_BIT_SYMBOLS
510 
MarkSuffixAsSorted(unsigned int suffixIndex,unsigned int & sortedIndex)511 inline void MSufSort::MarkSuffixAsSorted(unsigned int suffixIndex, unsigned int  & sortedIndex)
512 {
513 	// Sets the final inverse suffix array value for a given suffix.
514 	// Also invokes the OnSortedSuffix member function.
515 
516 	if (m_tandemRepeatDepth)
517 	{
518 		// we are processing a list of suffixes which we the second to last in tandem repeats
519 		// that were not terminated via induction.  These suffixes are not actually to be
520 		// marked as sorted yet.  Instead, they are to be linked together in sorted order.
521 		if (m_firstSortedTandemRepeat == END_OF_CHAIN)
522 			m_firstSortedTandemRepeat = m_lastSortedTandemRepeat = suffixIndex;
523 		else
524 			m_lastSortedTandemRepeat = (m_ISA[m_lastSortedTandemRepeat] = suffixIndex);
525 		return;
526 	}
527 
528 	m_ISA[suffixIndex] = (sortedIndex++ | SUFFIX_SORTED);
529 	#ifdef SHOW_PROGRESS
530 		OnSortedSuffix(suffixIndex);
531 	#endif
532 
533 	#ifdef USE_ENHANCED_INDUCTION_SORTING
534 		if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
535 		{
536 			suffixIndex--;
537 			unsigned short symbol = Value16(suffixIndex);
538 
539 			m_ISA[suffixIndex] = (m_firstSortedPosition[symbol]++ | SUFFIX_SORTED);
540 			#ifdef SHOW_PROGRESS
541 				OnSortedSuffix(suffixIndex);
542 			#endif
543 
544 			if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
545 			{
546 				suffixIndex--;
547 				symbol = ENDIAN_SWAP_16(symbol);
548 				if (m_firstSuffixByEnhancedInductionSort[symbol] == END_OF_CHAIN)
549 					m_firstSuffixByEnhancedInductionSort[symbol] = m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
550 				else
551 				{
552 					m_ISA[m_lastSuffixByEnhancedInductionSort[symbol]] = suffixIndex;
553 					m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
554 				}
555 			}
556 		}
557 	#endif
558 }
559 
560 
561 
562 
MarkSuffixAsSorted2(unsigned int suffixIndex,unsigned int & sortedIndex)563 inline void MSufSort::MarkSuffixAsSorted2(unsigned int suffixIndex, unsigned int  & sortedIndex)
564 {
565 	// Sets the final inverse suffix array value for a given suffix.
566 	// Also invokes the OnSortedSuffix member function.
567 
568 	if (m_tandemRepeatDepth)
569 	{
570 		// we are processing a list of suffixes which we the second to last in tandem repeats
571 		// that were not terminated via induction.  These suffixes are not actually to be
572 		// marked as sorted yet.  Instead, they are to be linked together in sorted order.
573 		if (m_firstSortedTandemRepeat == END_OF_CHAIN)
574 			m_firstSortedTandemRepeat = m_lastSortedTandemRepeat = suffixIndex;
575 		else
576 			m_lastSortedTandemRepeat = (m_ISA[m_lastSortedTandemRepeat] = suffixIndex);
577 		return;
578 	}
579 
580 	m_ISA[suffixIndex] = (sortedIndex++ | SUFFIX_SORTED);
581 	#ifdef SHOW_PROGRESS
582 		OnSortedSuffix(suffixIndex);
583 	#endif
584 
585 	#ifdef USE_ENHANCED_INDUCTION_SORTING
586 	if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
587 	{
588 		unsigned short symbol = Value16(suffixIndex);
589 		symbol = ENDIAN_SWAP_16(symbol);
590 		suffixIndex--;
591 		if (m_firstSuffixByEnhancedInductionSort[symbol] == END_OF_CHAIN)
592 			m_firstSuffixByEnhancedInductionSort[symbol] = m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
593 		else
594 		{
595 			m_ISA[m_lastSuffixByEnhancedInductionSort[symbol]] = suffixIndex;
596 			m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
597 		}
598 	}
599 	#endif
600 }
601 
602 
603 #else
604 
605 
MarkSuffixAsSorted(unsigned int suffixIndex,unsigned int & sortedIndex)606 inline void MSufSort::MarkSuffixAsSorted(unsigned int suffixIndex, unsigned int  & sortedIndex)
607 {
608 	// Sets the final inverse suffix array value for a given suffix.
609 	// Also invokes the OnSortedSuffix member function.
610 
611 	if (m_tandemRepeatDepth)
612 	{
613 		// we are processing a list of suffixes which we the second to last in tandem repeats
614 		// that were not terminated via induction.  These suffixes are not actually to be
615 		// marked as sorted yet.  Instead, they are to be linked together in sorted order.
616 		if (m_firstSortedTandemRepeat == END_OF_CHAIN)
617 			m_firstSortedTandemRepeat = m_lastSortedTandemRepeat = suffixIndex;
618 		else
619 			m_lastSortedTandemRepeat = (m_ISA[m_lastSortedTandemRepeat] = suffixIndex);
620 		return;
621 	}
622 
623 	m_ISA[suffixIndex] = (sortedIndex++ | SUFFIX_SORTED);
624 	#ifdef SHOW_PROGRESS
625 		OnSortedSuffix(suffixIndex);
626 	#endif
627 
628 	#ifdef USE_ENHANCED_INDUCTION_SORTING
629 		if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
630 		{
631 			suffixIndex--;
632 			unsigned short symbol = Value16(suffixIndex);
633 
634 			m_ISA[suffixIndex] = (m_firstSortedPosition[symbol]++ | SUFFIX_SORTED);
635 			#ifdef SHOW_PROGRESS
636 				OnSortedSuffix(suffixIndex);
637 			#endif
638 
639 		if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
640 		{
641 			suffixIndex--;
642 			unsigned short symbol2 = symbol;
643 			symbol = Value16(suffixIndex);
644 
645 			m_ISA[suffixIndex] = (m_firstSortedPosition[symbol]++ | SUFFIX_SORTED);
646 			#ifdef SHOW_PROGRESS
647 				OnSortedSuffix(suffixIndex);
648 			#endif
649 
650 			if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
651 			{
652 				if (m_source[suffixIndex] < m_source[suffixIndex + 1])
653 					symbol2 = ENDIAN_SWAP_16(symbol);
654 				else
655 					symbol2 = ENDIAN_SWAP_16(symbol2);
656 				suffixIndex--;
657 				if (m_firstSuffixByEnhancedInductionSort[symbol2] == END_OF_CHAIN)
658 					m_firstSuffixByEnhancedInductionSort[symbol2] = m_lastSuffixByEnhancedInductionSort[symbol2] = suffixIndex;
659 				else
660 				{
661 					m_ISA[m_lastSuffixByEnhancedInductionSort[symbol2]] = suffixIndex;
662 					m_lastSuffixByEnhancedInductionSort[symbol2] = suffixIndex;
663 				}
664 			}
665 		}
666 		}
667 	#endif
668 }
669 
670 
671 
672 
MarkSuffixAsSorted2(unsigned int suffixIndex,unsigned int & sortedIndex)673 inline void MSufSort::MarkSuffixAsSorted2(unsigned int suffixIndex, unsigned int  & sortedIndex)
674 {
675 	// Sets the final inverse suffix array value for a given suffix.
676 	// Also invokes the OnSortedSuffix member function.
677 
678 	if (m_tandemRepeatDepth)
679 	{
680 		// we are processing a list of suffixes which we the second to last in tandem repeats
681 		// that were not terminated via induction.  These suffixes are not actually to be
682 		// marked as sorted yet.  Instead, they are to be linked together in sorted order.
683 		if (m_firstSortedTandemRepeat == END_OF_CHAIN)
684 			m_firstSortedTandemRepeat = m_lastSortedTandemRepeat = suffixIndex;
685 		else
686 			m_lastSortedTandemRepeat = (m_ISA[m_lastSortedTandemRepeat] = suffixIndex);
687 		return;
688 	}
689 
690 	m_ISA[suffixIndex] = (sortedIndex++ | SUFFIX_SORTED);
691 	#ifdef SHOW_PROGRESS
692 		OnSortedSuffix(suffixIndex);
693 	#endif
694 
695 	#ifdef USE_ENHANCED_INDUCTION_SORTING
696 	if ((suffixIndex) && (m_ISA[suffixIndex - 1] == SORTED_BY_ENHANCED_INDUCTION))
697 	{
698 		unsigned short symbol;
699 		if (m_source[suffixIndex] < m_source[suffixIndex + 1])
700 			symbol = Value16(suffixIndex);
701 		else
702 			symbol = Value16(suffixIndex + 1);
703 		symbol = ENDIAN_SWAP_16(symbol);
704 		suffixIndex--;
705 		if (m_firstSuffixByEnhancedInductionSort[symbol] == END_OF_CHAIN)
706 			m_firstSuffixByEnhancedInductionSort[symbol] = m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
707 		else
708 		{
709 			m_ISA[m_lastSuffixByEnhancedInductionSort[symbol]] = suffixIndex;
710 			m_lastSuffixByEnhancedInductionSort[symbol] = suffixIndex;
711 		}
712 	}
713 	#endif
714 }
715 
716 #endif
717 
718 
ProcessNextChain()719 inline void MSufSort::ProcessNextChain()
720 {
721 	// Sorts the suffixes of a given chain by the next two symbols of
722 	// each suffix in the chain.  This creates zero or more new suffix
723 	// chains with each sorted by two more symbols than the original
724 	// chain.  Then pushes these new chains onto the chain stack for
725 	// further sorting.
726 	while (--m_chainCountStack.Top() < 0)
727 	{
728 		m_chainCountStack.Pop();
729 		m_chainMatchLengthStack.Pop();
730 	}
731 	m_suffixMatchLength = m_chainMatchLengthStack.Top();
732 	m_currentSuffixIndex = m_chainHeadStack.Pop();
733 
734 	#ifdef USE_ENHANCED_INDUCTION_SORTING
735 	if (m_chainMatchLengthStack.Count() == 1)
736 	{
737 		// one of the original buckets from InitialSort().  This is important
738 		// when enhanced induction sorting is enabled.
739 		unsigned short chainId = Value16(m_currentSuffixIndex);
740 		unsigned short temp = chainId;
741 		chainId = ENDIAN_SWAP_16(chainId);
742 		while (m_currentSuffixChainId <= chainId)
743 			ProcessSuffixesSortedByEnhancedInduction(m_currentSuffixChainId++);
744 		m_nextSortedSuffixValue = m_firstSortedPosition[temp];
745 	}
746 	#endif
747 
748 	if (m_ISA[m_currentSuffixIndex] == END_OF_CHAIN)
749 		MarkSuffixAsSorted(m_currentSuffixIndex, m_nextSortedSuffixValue);  // only one suffix in bucket so it is sorted.
750 	else
751 	{
752 		do
753 		{
754 			if (IsTandemRepeat())
755 				PassTandemRepeat();
756 			else
757 				if ((m_currentSuffixIndex != END_OF_CHAIN) && (IsSortedByInduction()))
758 					m_currentSuffixIndex = m_ISA[m_currentSuffixIndex];
759 				else
760 					{
761 						unsigned int firstSuffixIndex = m_currentSuffixIndex;
762 						unsigned int lastSuffixIndex = m_currentSuffixIndex;
763 						unsigned short targetSymbol = Value16(m_currentSuffixIndex + m_suffixMatchLength);
764 						unsigned int nextSuffix;
765 
766 						do
767 						{
768 							nextSuffix = m_ISA[lastSuffixIndex = m_currentSuffixIndex];
769 							if ((m_currentSuffixIndex = nextSuffix) == END_OF_CHAIN)
770 								break;
771 							else
772 								if (IsTandemRepeat())
773 								{
774 									PassTandemRepeat();
775 									break;
776 								}
777 								else
778 									if (IsSortedByInduction())
779 									{
780 										m_currentSuffixIndex = m_ISA[nextSuffix];
781 										break;
782 									}
783 						} while (Value16(m_currentSuffixIndex + m_suffixMatchLength) == targetSymbol);
784 
785 					AddToSuffixChain(firstSuffixIndex, lastSuffixIndex, targetSymbol);
786 				}
787 		} while (m_currentSuffixIndex != END_OF_CHAIN);
788 
789 		ProcessSuffixesSortedByInduction();
790 		PushNewChainsOntoStack();
791 	}
792 }
793 
794 
795 
796 
797 
798 
ProcessSuffixesSortedByInduction()799 inline void MSufSort::ProcessSuffixesSortedByInduction()
800 {
801 	unsigned int numSuffixes = m_suffixesSortedByInduction.Count();
802 	if (numSuffixes)
803 	{
804 		InductionSortObject * objects = m_suffixesSortedByInduction.m_stack;
805 		if (numSuffixes > 1)
806 			IntroSort(objects, numSuffixes);
807 
808 		if (m_hasTandemRepeatSortedByInduction)
809 		{
810 			// During the last pass some suffixes which were sorted via induction were also
811 			// determined to be the terminal suffix in a tandem repeat.  So when we mark
812 			// the suffixes as sorted (where were sorted via induction) we make chain together
813 			// the preceding suffix in the tandem repeat (if there is one).
814 			unsigned int firstTandemRepeatIndex = END_OF_CHAIN;
815 			unsigned int lastTandemRepeatIndex = END_OF_CHAIN;
816 			unsigned int tandemRepeatLength = m_suffixMatchLength - 1;
817 			m_hasTandemRepeatSortedByInduction = false;
818 
819 			for (unsigned int i = 0; i < numSuffixes; i++)
820 			{
821 				unsigned int suffixIndex = (objects[i].m_sortValue[1] & 0x3fffffff);
822 				if ((suffixIndex >= tandemRepeatLength) && (m_ISA[suffixIndex - tandemRepeatLength] == suffixIndex))
823 				{
824 					// this suffix was a terminating suffix in a tandem repeat.
825 					// add the preceding suffix in the tandem repeat to the list.
826 					if (firstTandemRepeatIndex == END_OF_CHAIN)
827 						firstTandemRepeatIndex = lastTandemRepeatIndex = (suffixIndex - tandemRepeatLength);
828 					else
829 						lastTandemRepeatIndex = (m_ISA[lastTandemRepeatIndex] = (suffixIndex - tandemRepeatLength));
830 				}
831 				MarkSuffixAsSorted(suffixIndex, m_nextSortedSuffixValue);
832 			}
833 
834 			// now process each suffix in the tandem repeat list making each as sorted.
835 			// build a new list for tandem repeats which preceded each in the list until there are
836 			// no preceding tandem suffix for any suffix in the list.
837 
838 			while (firstTandemRepeatIndex != END_OF_CHAIN)
839 			{
840 				m_ISA[lastTandemRepeatIndex] = END_OF_CHAIN;
841 				unsigned int suffixIndex = firstTandemRepeatIndex;
842 				firstTandemRepeatIndex = END_OF_CHAIN;
843 				while (suffixIndex != END_OF_CHAIN)
844 				{
845 					if ((suffixIndex >= tandemRepeatLength) && (m_ISA[suffixIndex - tandemRepeatLength]  == suffixIndex))
846 					{
847 						// this suffix was a terminating suffix in a tandem repeat.
848 						// add the preceding suffix in the tandem repeat to the list.
849 						if (firstTandemRepeatIndex == END_OF_CHAIN)
850 							firstTandemRepeatIndex = lastTandemRepeatIndex = (suffixIndex - tandemRepeatLength);
851 						else
852 							lastTandemRepeatIndex = (m_ISA[lastTandemRepeatIndex] = (suffixIndex - tandemRepeatLength));
853 					}
854 					unsigned int nextSuffix = m_ISA[suffixIndex];
855 					MarkSuffixAsSorted(suffixIndex, m_nextSortedSuffixValue);
856 					suffixIndex = nextSuffix;
857 				}
858 			}
859 			// finished.
860 		}
861 		else
862 		{
863 			// This is the typical branch on the condition.  There were no tandem repeats
864 			// encountered during the last chain that were terminated with a suffix that
865 			// was sorted via induction.  In this case we just mark the suffixes as sorted
866 			// and we are done.
867 			for (unsigned int i = 0; i < numSuffixes; i++)
868 				MarkSuffixAsSorted(objects[i].m_sortValue[1] & 0x3fffffff, m_nextSortedSuffixValue);
869 		}
870 		m_suffixesSortedByInduction.Clear();
871 	}
872 }
873 
874 
875 
876 
877 
ProcessSuffixesSortedByEnhancedInduction(unsigned short suffixId)878 inline void MSufSort::ProcessSuffixesSortedByEnhancedInduction(unsigned short suffixId)
879 {
880 	//
881 	if (m_firstSuffixByEnhancedInductionSort[suffixId] != END_OF_CHAIN)
882 	{
883 		unsigned int currentSuffixIndex = m_firstSuffixByEnhancedInductionSort[suffixId];
884 		unsigned int lastSuffixIndex = m_lastSuffixByEnhancedInductionSort[suffixId];
885 		m_firstSuffixByEnhancedInductionSort[suffixId] = END_OF_CHAIN;
886 		m_lastSuffixByEnhancedInductionSort[suffixId] = END_OF_CHAIN;
887 
888 		do
889 		{
890 			unsigned short symbol = Value16(currentSuffixIndex);
891 			unsigned int nextIndex = m_ISA[currentSuffixIndex];
892 			MarkSuffixAsSorted2(currentSuffixIndex, m_firstSortedPosition[symbol]);
893 			if (currentSuffixIndex == lastSuffixIndex)
894 			{
895 				if (m_firstSuffixByEnhancedInductionSort[suffixId] == END_OF_CHAIN)
896 					return;
897 				currentSuffixIndex = m_firstSuffixByEnhancedInductionSort[suffixId];
898 				lastSuffixIndex = m_lastSuffixByEnhancedInductionSort[suffixId];
899 				m_firstSuffixByEnhancedInductionSort[suffixId] = END_OF_CHAIN;
900 				m_lastSuffixByEnhancedInductionSort[suffixId] = END_OF_CHAIN;
901 			}
902 			else
903 				currentSuffixIndex = nextIndex;
904 		} while (true);
905 	}
906 }
907 
908 
909 
910 
911 #ifdef SHOW_PROGRESS
ShowProgress()912 inline void MSufSort::ShowProgress()
913 {
914 	// Update the progress indicator.
915 	//double p = ((double)(m_numSortedSuffixes & 0x3fffffff) / m_sourceLength) * 100;
916 //	printf("Progress: %.2f%% %c", p, 13);
917 }
918 #endif
919 #endif
920