1
2 #include "UtilStr.h"
3
4
5 #include "CEgIStream.h"
6 #include "CEgOStream.h"
7
8 #include <math.h>
9
10 #include <string.h>
11
12
UtilStr()13 UtilStr::UtilStr() {
14
15 init();
16 }
17
18
19
UtilStr(const char * inCStr)20 UtilStr::UtilStr( const char* inCStr ) {
21 init();
22 Append( inCStr );
23 }
24
25
26
27
UtilStr(const unsigned char * inStrPtr)28 UtilStr::UtilStr( const unsigned char* inStrPtr ) {
29 init();
30 Append( inStrPtr );
31 }
32
33
34
UtilStr(const UtilStr & inStr)35 UtilStr::UtilStr( const UtilStr& inStr ) {
36 init();
37 Append( &inStr );
38 }
39
40
41
UtilStr(const UtilStr * inStr)42 UtilStr::UtilStr( const UtilStr* inStr ) {
43 init();
44
45 Append( inStr );
46 }
47
48
49
UtilStr(long inNum)50 UtilStr::UtilStr( long inNum ) {
51 init();
52 Append( inNum );
53 }
54
55
UtilStr(const void * inPtr,long bytes)56 UtilStr::UtilStr( const void* inPtr, long bytes ) {
57
58 init();
59 Append( inPtr, bytes );
60 }
61
62
63
64 #if 0
65 #include <stdio.h>
66 #endif
~UtilStr()67 UtilStr::~UtilStr() {
68 #if 0
69 fprintf(stderr, "~UtilStr()->(this = %.8X mBuf = %.8X)\n", this, mBuf);
70 #endif
71 if ( mBuf )
72 delete mBuf;
73 #if 0
74 fprintf(stderr, "<-~UtilStr()\n");
75 #endif
76 }
77
78
79
init()80 void UtilStr::init() {
81
82 mStrLen = 0;
83 mBufSize = 0;
84 mBuf = 0;
85 }
86
87
88
Swap(UtilStr & ioStr)89 void UtilStr::Swap( UtilStr& ioStr ) {
90 long len = mStrLen, size = mBufSize;
91 char* buf = mBuf;
92
93 mBuf = ioStr.mBuf;
94 mStrLen = ioStr.mStrLen;
95 mBufSize = ioStr.mBufSize;
96
97 ioStr.mBuf = buf;
98 ioStr.mStrLen = len;
99 ioStr.mBufSize = size;
100 }
101
102
103
getPasStr() const104 unsigned char* UtilStr::getPasStr() const {
105
106 if ( ! mBuf )
107 return (unsigned char*) "\0";
108
109 if ( mStrLen < 255 )
110 mBuf[0] = mStrLen;
111 else
112 mBuf[0] = 255;
113
114 return (unsigned char*) &mBuf[0];
115 }
116
117
118
119
getCStr() const120 char* UtilStr::getCStr() const {
121
122 if ( mBuf ) {
123 mBuf[ mStrLen + 1 ] = '\0';
124 return &mBuf[1]; }
125 else
126 return "\0";
127
128 }
129
130
131
132
133
134
135
Append(const char * inCStr)136 void UtilStr::Append( const char* inCStr ) {
137 register unsigned long i = 0;
138
139 if ( inCStr ) {
140 while( inCStr[ i ] != '\0' )
141 i++;
142 Append( inCStr, i );
143 }
144 }
145
146
147
Append(const unsigned char * inStrPtr)148 void UtilStr::Append( const unsigned char* inStrPtr ) {
149
150 if ( inStrPtr )
151 Append( (char*) &inStrPtr[1], inStrPtr[0] );
152
153 }
154
155
156
Append(const void * inSrce,long numBytes)157 void UtilStr::Append( const void* inSrce, long numBytes ) {
158 unsigned long newLen = numBytes + mStrLen;
159 char* oldBuf;
160
161 if ( numBytes > 0 ) {
162 if ( newLen >= mBufSize ) {
163 if ( newLen < 80 )
164 mBufSize = newLen + 5;
165 else if ( newLen < 500 )
166 mBufSize = newLen + 100;
167 else mBufSize = newLen + 3000;
168
169 oldBuf = mBuf;
170 mBuf = new char[ mBufSize + 2 ]; // One for pascal byte, one for c NUL byte
171 if ( oldBuf ) {
172 if ( mStrLen > 0 )
173 Move( &mBuf[1], &oldBuf[1], mStrLen );
174
175 delete oldBuf;
176 }
177 }
178
179 if ( inSrce && numBytes > 0 )
180 Move( &mBuf[mStrLen + 1], inSrce, numBytes );
181
182 mStrLen = newLen;
183 }
184 }
185
186
187
188
189
Append(long inNum)190 void UtilStr::Append( long inNum ) {
191 UtilStr temp;
192 unsigned long i;
193
194 if ( inNum < 0 ) {
195 Append( '-' );
196 inNum = - inNum;
197 }
198
199 if ( inNum == 0 )
200 Append( '0' );
201
202 while ( inNum > 0 ) {
203 temp.Append( (char) ('0' + inNum % 10) );
204 inNum = inNum / 10;
205 }
206
207 for ( i = temp.length(); i > 0; i-- )
208 Append( temp.getChar( i ) );
209 }
210
211
212
213
214
215
216
Append(const UtilStr * inStr)217 void UtilStr::Append( const UtilStr* inStr ) {
218
219 if ( inStr )
220 Append( inStr -> getCStr(), inStr -> length() );
221 }
222
223
224
225
226
227
Assign(const UtilStr & inStr)228 void UtilStr::Assign( const UtilStr& inStr ) {
229
230 if ( this != &inStr ) {
231 Wipe();
232 Append( inStr.getCStr(), inStr.length() );
233 }
234 }
235
236
237
Assign(const void * inPtr,long bytes)238 void UtilStr::Assign( const void* inPtr, long bytes ) {
239 Wipe();
240 Append( inPtr, bytes );
241 }
242
243
244
Assign(char inChar)245 void UtilStr::Assign( char inChar ) {
246 Wipe();
247 Append( &inChar, 1 );
248 }
249
250
251
Assign(long inNum)252 void UtilStr::Assign( long inNum ) {
253
254 Wipe();
255 Append( inNum );
256 }
257
258
259
260
261
262
263
264
Assign(const unsigned char * inStrPtr)265 void UtilStr::Assign( const unsigned char* inStrPtr ) {
266 Wipe();
267 Append( inStrPtr );
268 }
269
270
271
272
273
Assign(const UtilStr * inStr)274 void UtilStr::Assign( const UtilStr* inStr ) {
275
276
277 if ( inStr != this ) {
278 Wipe();
279 if ( inStr )
280 Append( inStr -> getCStr(), inStr -> length() );
281 }
282 }
283
284
285
Assign(CEgIStream & inStream,long numBytes)286 void UtilStr::Assign( CEgIStream& inStream, long numBytes ) {
287
288 if ( numBytes > 5000000 ) // *** Safe to say that sizes over this are corrupt?
289 inStream.throwErr( cCorrupted );
290 else if ( numBytes > 0 ) {
291 Dim( numBytes );
292 if ( length() < numBytes )
293 numBytes = length();
294 inStream.GetBlock( getCStr(), numBytes );
295 }
296 }
297
298
299
300
301
302
ReadFrom(CEgIStream * inStream)303 void UtilStr::ReadFrom( CEgIStream* inStream ) {
304 unsigned long len = inStream -> GetLong(); // Grab the length
305
306 Assign( *inStream, len );
307 }
308
309
310
311
WriteTo(CEgOStream * inStream) const312 void UtilStr::WriteTo( CEgOStream* inStream ) const {
313
314 inStream -> PutLong( length() );
315 inStream -> PutBlock( getCStr(), length() );
316 }
317
318
319
320
321
322
Prepend(char inChar)323 void UtilStr::Prepend( char inChar ) {
324
325 Insert( 0, &inChar, 1 );
326 }
327
328
Prepend(UtilStr & inStr)329 void UtilStr::Prepend( UtilStr& inStr ) {
330
331 Insert( 0, inStr.getCStr(), inStr.length() );
332 }
333
334
335
Prepend(const char * inStr)336 void UtilStr::Prepend( const char* inStr ) {
337 long len = 0;
338
339 while ( *(char*)(inStr + len) != 0 )
340 len++;
341
342 Insert( 0, inStr, len );
343 }
344
345
Insert(unsigned long inPos,char inChar,long inNumTimes)346 void UtilStr::Insert( unsigned long inPos, char inChar, long inNumTimes ) {
347 unsigned long oldLen = length();
348 unsigned long numAddable;
349
350 if ( inPos > oldLen )
351 inPos = oldLen;
352
353 Insert( inPos, (char*) 0, inNumTimes );
354 numAddable = length() - oldLen;
355 if ( numAddable > 0 && mBuf ) {
356 while ( inNumTimes > 0 ) {
357 mBuf[ inPos + inNumTimes ] = inChar;
358 inNumTimes--;
359 }
360 }
361 }
362
363
Insert(unsigned long inPos,long inNum)364 void UtilStr::Insert( unsigned long inPos, long inNum ) {
365 UtilStr numStr( inNum );
366
367 Insert( inPos, numStr );
368
369 }
370
Insert(unsigned long inPos,const UtilStr & inStr)371 void UtilStr::Insert( unsigned long inPos, const UtilStr& inStr ) {
372 Insert( inPos, inStr.getCStr(), inStr.length() );
373 }
374
375
Insert(unsigned long inPos,const char * inSrce,long inBytes)376 void UtilStr::Insert( unsigned long inPos, const char* inSrce, long inBytes ) {
377 unsigned long numToMove, len = length();
378
379 if ( inPos >= len )
380 Append( inSrce, inBytes );
381 else if ( inBytes > 0 ) {
382 Append( 0, inBytes );
383 numToMove = len - inPos;
384 if ( numToMove > 0 )
385 Move( &mBuf[ inPos + inBytes + 1 ], &mBuf[ inPos + 1 ], numToMove );
386 if ( inBytes > 0 && inSrce )
387 Move( &mBuf[ inPos + 1 ], inSrce, inBytes );
388 }
389 }
390
391
392
393
394
395
396
Move(void * inDest,const void * inSrce,unsigned long inNumBytes)397 void UtilStr::Move( void* inDest, const void* inSrce, unsigned long inNumBytes ) {
398
399 if ( inNumBytes <= 64 ) {
400 register unsigned char* dest = (unsigned char*) inDest;
401 register unsigned char* srce = (unsigned char*) inSrce;
402 if ( srce > dest )
403 while ( inNumBytes > 0 ) {
404 *dest = *srce;
405 dest++;
406 srce++;
407 inNumBytes--;
408 }
409 else {
410 dest += inNumBytes;
411 srce += inNumBytes;
412 while ( inNumBytes > 0 ) {
413 dest--;
414 srce--;
415 *dest = *srce;
416 inNumBytes--;
417 } } }
418 else
419 memmove(inDest, inSrce, inNumBytes);
420 }
421
422
423
Decapitalize()424 void UtilStr::Decapitalize() {
425 unsigned long i, len = length();
426 unsigned char c, sp;
427
428 for( i = 2; i <= len; i++ ) {
429 c = getChar( i );
430 sp = getChar( i-1 );
431 if ( ( sp >= 'A' && sp <= 'Z' ) || ( sp >= 'a' && sp <= 'z' ) ) {
432 if ( getChar( i-1 ) == 'I' && c == 'I' ) { } // Prevent III from decapitalizing
433 else if ( c >= 'A' && c <= 'Z' )
434 setChar( i, c + 32 );
435 }
436 }
437
438 }
439
440
441
442
443
Capitalize()444 void UtilStr::Capitalize() {
445 unsigned long i, len = length();
446 char c;
447
448 for( i = 1; i <= len; i++ ) {
449 c = getChar( i );
450 if ( c >= 'a' && c <= 'z' ) {
451 setChar( i, c - 32 );
452 }
453 }
454
455 }
456
457
458
459
Trunc(unsigned long numToChop,bool fromRight)460 void UtilStr::Trunc( unsigned long numToChop, bool fromRight ) {
461
462 if ( fromRight )
463 Remove( length() - numToChop + 1, numToChop );
464 else
465 Remove( 1, numToChop );
466 }
467
468
469
470
Remove(unsigned long inPos,unsigned long inNum)471 void UtilStr::Remove( unsigned long inPos, unsigned long inNum ) {
472 unsigned long toMove;
473 unsigned long len = length();
474
475 if ( inPos < 1 )
476 inPos = 1;
477
478 if ( inNum > len - inPos + 1 )
479 inNum = len - inPos + 1;
480
481 if ( inPos <= len && inNum > 0 ) {
482
483 mStrLen = len - inNum;
484 toMove = len - inPos - inNum + 1;
485
486 if ( toMove > 0 )
487 Move( &mBuf[ inPos ], &mBuf[ inPos + inNum ], toMove );
488 }
489 }
490
491
492
493
Remove(char * inStr,int inLen,bool inCaseSensitive)494 void UtilStr::Remove( char* inStr, int inLen, bool inCaseSensitive ) {
495 long pos;
496 char* s;
497
498 if ( inLen < 0 ) {
499 inLen = 0;
500 s = inStr;
501 while ( *s ) {
502 inLen++;
503 s++;
504 }
505 }
506
507 pos = contains( inStr, inLen, 0, inCaseSensitive );
508 while ( pos > 0 ) {
509 Remove( pos, inLen );
510 pos--;
511 pos = contains( inStr, inLen, pos, inCaseSensitive );
512 }
513 }
514
515
516
517
518
Keep(unsigned long inNumToKeep)519 void UtilStr::Keep( unsigned long inNumToKeep ) {
520
521 if ( inNumToKeep < length() )
522 Trunc( length() - inNumToKeep, true );
523 }
524
525
526
527
PoliteKeep(unsigned long inMaxLen,unsigned long inPos)528 void UtilStr::PoliteKeep( unsigned long inMaxLen, unsigned long inPos ) {
529 if ( length() > inMaxLen ) {
530 Remove( inPos + 1, length() - inMaxLen + 1 );
531 setChar( inPos, '�' );
532 }
533 }
534
535
536
ZapLeadingSpaces()537 void UtilStr::ZapLeadingSpaces() {
538 unsigned long i = 1;
539 unsigned long len = length();
540
541 while ( getChar( i ) == ' ' && i <= len )
542 i++;
543
544 if ( i > 1 )
545 Trunc( i - 1, false );
546 }
547
548
549
550
551
552 /*
553
554
555 UtilStr UtilStr::MID( unsigned long start, unsigned long inLen ) const {
556 UtilStr newStr;
557 unsigned long len = length();
558
559 if ( start < 1 )
560 start = 1;
561
562 start--;
563
564 if ( inLen > len - start )
565 inLen = len - start;
566
567 if ( start <= len && inLen > 0 )
568 newStr.Assign( getCStr() + start, inLen );
569
570 return newStr;
571 }
572
573
574
575
576 UtilStr UtilStr::RIGHT( unsigned long inLen ) const {
577 UtilStr newStr;
578 unsigned long start, len = length();
579
580 if ( inLen > len )
581 inLen = len;
582
583 start = len - inLen;
584
585 newStr.Assign( getCStr() + start, inLen );
586 return newStr;
587 }
588 */
589
590
591
592
593
594
595
FindNextInstanceOf(long inPos,char c) const596 long UtilStr::FindNextInstanceOf( long inPos, char c ) const {
597 long len = length(), i;
598
599 if ( inPos < 0 )
600 inPos = 0;
601
602 for ( i = inPos+1; i <= len; i++ ) {
603 if ( mBuf[ i ] == c )
604 return i;
605 }
606
607 return 0;
608 }
609
610
FindPrevInstanceOf(long inPos,char c) const611 long UtilStr::FindPrevInstanceOf( long inPos, char c ) const {
612 long i;
613
614 if ( inPos > length() )
615 inPos = length();
616
617 for ( i = inPos; i > 0; i-- ) {
618 if ( mBuf[ i ] == c )
619 return i;
620 }
621
622 return 0;
623 }
624
625
Replace(char inTarget,char inReplacement)626 long UtilStr::Replace( char inTarget, char inReplacement ) {
627 unsigned long count, i, len = length();
628
629 count = 0;
630
631 for ( i = 1; i <= len; i++ ) {
632 if ( mBuf[ i ] == inTarget ) {
633 mBuf[ i ] = inReplacement;
634 count++;
635 }
636 }
637
638 return count;
639 }
640
641
Replace(char * inTarget,char * inReplacement,bool inCaseSensitive)642 long UtilStr::Replace( char* inTarget, char* inReplacement, bool inCaseSensitive ) {
643 char* srce;
644 long count = 0, pos, prevPos = 0;
645
646 // Calc the len of the target
647 long targLen = 0;
648 while ( inTarget[ targLen ] )
649 targLen++;
650
651 // See if there's at least one instance of the target in this string...
652 pos = contains( inTarget, targLen, 0, inCaseSensitive );
653 if ( pos ) {
654
655 // Make this the dest str
656 UtilStr srceStr( this );
657 srce = srceStr.getCStr();
658 count = 0;
659 Keep( pos - 1 );
660 goto _resume;
661
662 while ( pos ) {
663 Append( srce + prevPos, pos - prevPos - 1 );
664 _resume: Append( inReplacement );
665 count++;
666 prevPos = pos + targLen - 1;
667 pos = srceStr.contains( inTarget, targLen, prevPos, inCaseSensitive );
668 }
669
670 Append( srce + prevPos, srceStr.length() - prevPos );
671 }
672
673 return count;
674 }
675
copyTo(unsigned char * pasDestPtr,unsigned char inBytesToCopy) const676 void UtilStr::copyTo( unsigned char* pasDestPtr, unsigned char inBytesToCopy ) const {
677 unsigned long bytes = length() + 1;
678
679 if ( bytes > inBytesToCopy )
680 bytes = inBytesToCopy;
681
682 getPasStr(); // refreshes len byte for pas str
683
684 if ( bytes > 255 )
685 bytes = 255;
686
687 Move( pasDestPtr, &mBuf[0], bytes );
688 }
689
690
691
692
copyTo(char * inDestPtr,unsigned long inBytesToCopy) const693 void UtilStr::copyTo( char* inDestPtr, unsigned long inBytesToCopy ) const {
694 unsigned long bytes = length() + 1;
695
696 if ( bytes > inBytesToCopy )
697 bytes = inBytesToCopy;
698
699 getCStr(); // refreshes NUL byte for c strs
700
701
702 Move( inDestPtr, &mBuf[1], bytes );
703 }
704
705
706
707
708
709
710
getChar(unsigned long i) const711 char UtilStr::getChar( unsigned long i ) const {
712
713 if ( i <= mStrLen && i > 0 )
714 return mBuf[ i ];
715 else
716 return '\0';
717 }
718
719
720
setChar(unsigned long i,char inChar)721 void UtilStr::setChar( unsigned long i, char inChar ) {
722
723
724 if ( i <= mStrLen && i > 0 )
725 mBuf[ i ] = inChar;
726 }
727
728
729
730
731
StrCmp(const char * s1,const char * s2,long inN,bool inCaseSensitive)732 int UtilStr::StrCmp( const char* s1, const char* s2, long inN, bool inCaseSensitive ) {
733 char c1, c2;
734
735 if ( inN < 0 ) {
736 inN = 0;
737 const char* s = (*s1 != 0) ? s1 : s2;
738 while ( *s ) {
739 s++;
740 inN++;
741 }
742 }
743
744 while ( inN > 0 ) {
745 inN--;
746 c1 = *s1;
747 s1++;
748 c2 = *s2;
749 s2++;
750 if ( ! inCaseSensitive ) {
751 if ( c1 >= 'a' && c1 <= 'z' )
752 c1 -= 32;
753 if ( c2 >= 'a' && c2 <= 'z' )
754 c2 -= 32;
755 }
756 if ( c1 != c2 )
757 return c1 - c2;
758 }
759
760 return 0;
761 }
762
763
764
765
compareTo(const unsigned char * inPStr,bool inCaseSensitive) const766 int UtilStr::compareTo( const unsigned char* inPStr, bool inCaseSensitive ) const {
767
768 if ( inPStr ) {
769 if ( length() == inPStr[0] ) {
770 return StrCmp( getCStr(), (char*) (inPStr + 1), length(), inCaseSensitive );
771 }
772 }
773
774 return -1;
775 }
776
777
compareTo(const UtilStr * inStr,bool inCaseSensitive) const778 int UtilStr::compareTo( const UtilStr* inStr, bool inCaseSensitive ) const {
779
780 if ( inStr )
781 return StrCmp( inStr -> getCStr(), getCStr(), length() + 1, inCaseSensitive );
782 else
783 return -1;
784 }
785
786
compareTo(const char * inStr,bool inCaseSensitive) const787 int UtilStr::compareTo( const char* inStr, bool inCaseSensitive ) const {
788 if ( inStr )
789 return StrCmp( inStr, getCStr(), length() + 1, inCaseSensitive );
790 else
791 return -1;
792 }
793
794
795
contains(const char * inSrchStr,int inLen,int inStartingPos,bool inCaseSensitive) const796 long UtilStr::contains( const char* inSrchStr, int inLen, int inStartingPos, bool inCaseSensitive ) const {
797 char srchChar, srchCharLC, c;
798 char* endPtr, *curPtr = getCStr();
799
800 if ( inLen < 0 ) {
801 inLen = 0;
802 while ( *(inSrchStr+inLen) )
803 inLen++;
804 }
805
806 endPtr = curPtr + length() - inLen;
807
808 srchChar = *inSrchStr;
809 if ( srchChar >= 'a' && srchChar <= 'z' )
810 srchChar -= 32;
811 srchCharLC = srchChar + 32;
812 if ( inStartingPos > 0 )
813 curPtr += inStartingPos;
814
815 while ( curPtr <= endPtr ) {
816 c = *curPtr;
817 if ( c == srchChar || c == srchCharLC ) {
818 if ( StrCmp( curPtr, inSrchStr, inLen, inCaseSensitive ) == 0 )
819 return curPtr - getCStr() + 1;
820 }
821 curPtr++;
822 }
823
824 return 0;
825 }
826
827
828
829 /*
830 *** The following is some string silmilarity/matching theory... man, i miss cornell cs...
831
832 Dynamic Programming Algorithm for Sequence Alignment.
833 Dynamic Programming Algorithms are used for finding shortest paths in graphs, and in many other optimization problems, but in the comparison or alignment of strings (as in Biological DNA, RNA and protein sequence analysis, speech recognition and shape comparison) the following, or similar, is often called "the" dynamic programming algorithm (DPA).
834
835 Generic Dynamic Programming Algorithm for Comparing Two Strings:
836 Given two strings or sequences A[1..|A|] and B[1..|B|]
837
838 M[0, 0] = z -- usually z=0
839 M[i, 0] = f( M[i-1, 0 ], c(A[i], "_" ) ) -- Boundary conditions - delete A[i]
840 M[0, j] = f( M[0, j-1], c("_", B[j]) ) -- Boundary conditions - delete B[j]
841
842 M[i, j] = g( f( M[i-1, j-1], c(A[i], B[j]) ), -- match/mismatch
843 f( M[i-1, j ], c(A[i], "_" ) ), -- delete A[i]
844 f( M[i, j-1], c("_", B[j]) ) ) -- insert B[j]
845
846 Note that "_" represents the null (pseudo-)character.
847
848 M[i,j] represents the cost (or score) of the partial sequences A[1..i] and B[1..j], and the algorithm requires that M[i,j] can be calculated from the three neighbours of M[i,j] - to the north (M[i,j-1]), west (M[i-1,j]), and north west (M[i-1,j-1]).
849
850 c( x, y ): Cost of replacing character x with character y
851 f( ): String reconcatination function
852 g( ): Substring score chooser function
853
854 O( AB ) running time (Assming const running const running time for g, f and c)
855 O( A ) or O( B ) storage
856
857 So what are z, f(), g(), and c()? By varing them, the returned score comparison will follow different match criteria/behavior. Here's 5 sets of assignments:
858
859 1) === Longest Common Subsequence (LCS or LCSS)
860 z = 0
861 g( ) = min( )
862 f( ) = +
863 c(x,x) = 0, c(x,y) = c(x,"_") = c("_",x) = 1
864 A big LCS score means the sequences are similar - lots of matches c(x,x).
865 Note that an optimal LCS sequence alignment can be recovered either by retracing the `max' choices that were made from M[|A|,|B|] to M[0,0], or by using Hirschberg's (1975) divide and conquer technique.
866
867 2) === Levenshtein Metric or Sellers' Edit Distance
868 A big edit distance value means the sequences are dissimilar - lots of changes c(x,y), and indels c(x,"_") and c("_",x).
869
870 z = 0
871 g( ) = min( )
872 f( ) = +
873 c(x,x) = 0, c(x,y), c(x,"_"), c("_",x) > 0
874 The above assumes "simple" gap costs; linear but some other more complex gap costs can be incorporated with modifications.
875
876 3) === Probability of Alignments
877 For given probabilities, P(match), P(mismatch), P(insert) and P(delete), the following dynamic programming algorithm finds a most probable alignment of two given sequences:
878
879 z = 1 -- NB.
880 g( ) = max( )
881 f( ) = *
882 c(x,x) = P(match) * P(x)
883 c(x,y) = P(mismatch) * P(x,y | x!=y)
884 c(x,"_") = P(delete) * P(x)
885 c("_",x) = P(insert) * P(x)
886 Unfortunately the quantities calculated become very small for strings of realistic lengths and underflow is the consequence. It is more convenient to deal with the -log's of probabilities and this also corresponds to a coding or information theory interpretation - see below.
887
888 4) === Minimum Message Length MML Optimal Alignment
889 aka Minimum Description Length MDL
890
891 z = 0
892 g( ) = min( )
893 f( ) = +
894 c(x,x) = -log2(P(match)) - log2(P(x))
895 c(x,y) = -log2(P(mismatch)) - log2(P(x,y | x!=y))
896 c(x,"_") = -log2(P(delete)) - log2(P(x))
897 c("_",x) = -log2(P(insert) -log2(P(x))
898 - assuming "simple" gap costs; modify if not. Base-two logs are taken if you wish to interpret the quantities as bits; some prefer natural logarithms, giving nits.
899 An alignment can be thought of as a hypothesis of how strings A and B are related, and it can be compared with the null-theory that they are not related (Allison et al 1990(a), 1990(b), 1992). Thus, if an alignment gives some real data compression for the pair of strings, it is an acceptable hypothesis.
900
901 5) === Minimum Message Length R-Theory, Sum Over All Alignments
902 The r-theory (r for related) is the hypothesis that two strings are related in some unspecified way. Its (-log2) probability is calculated as above except that
903
904 g(,) = logplus(,)
905 where logplus(-log2(P1), -log2(P2)) = -log2(P1+P2)
906 One alignment is just one hypothesis of how one string changed into the other. There may be many optimal alignments and many more suboptimal alignments. Two alignments are two different or exclusive hypotheses so their probabilities can be added.
907
908 Thus the sum over all alignments gives the -log2 probability of the two strings being created in a related but unspecified way. The complement of this r-theory is that the strings are not related. Comparing the message lengths of the r-theory and the null-theory gives the posterior -log odds-ratio that the two strings are related (for a given model of string relation of course):
909
910 r-theory:
911 P(A & B & related) = P(A & B).P(related | A & B)
912 = P(related).P(A & B | related)
913 where P(A & B | related) = Sum[all L] P(A & B | alignment L)
914
915 null-theory:
916 P(A & B & not related) = P(not related).P(A & B | not related)
917 = P(not related).P(A).P(B)
918 = P(A & B).P(not related | A & B)
919
920
921 We can put a 50:50 prior on being related or unrelated. We do not know the prior probability of the data, P(A&B), but we can cancel it out to get the posterior -log-odds ratio of the r-theory and the null-theory.
922
923 It is fortunate that the sum of P(A&B|L) over all alignments L can be calculated in O(|A|*|B|) time for "finite-state" models of relation (based on finite-state machines) using the dynamic programming algorithm above and variations upon it (Allison et al 1990(a), 1990(b), 1992).
924
925 References.
926 L. Allison, C. S. Wallace and C. N. Yee. When is a String like a String? AI & Maths 1990(a)
927 L. Allison, C. S. Wallace and C. N. Yee. Inductive inference over macro-molecules. TR 90/148 Department of Computer Science, Monash University, November 1990(b)
928 L. Allison, C. S. Wallace and C. N. Yee. Finite-State Models in the Alignment of Macromolecules. Jrnl. Molec. Evol. 35 77-89 1992
929 D. S. Hirschberg. A Linear Space Algorithm for Computing Maximal Common Subsequences. Comm. Assoc. Comp. Mach. 18(6) 341-343 1975
930 V. I. Levenshtein. Binary Codes Capable of Correcting Deletions, Insertions and Reversals. Soviet Physics Doklady 10(8) 707-710 1966 and Doklady Akademii Nauk SSSR 163(4) 845-848 1965
931 P. Sellers. On the Theory and Computation of Evolutionary Distances. SIAM J. Appl. Math 26(4) 787-793 1974
932 Lloyd Allison, Department of Computer Science, Monash University, Australia 3168
933 */
934
935 // An implementaion of the Longest Common Subsequece function set for the general Sequence Alignment alg described above
936 #define __MIN( a, b ) ( ( (a) < (b) ) ? (a) : (b) )
937 #define STACK_TEMP_SIZE 30
938 #define COST_CONTIGUOUS 1
939 #define COST_OF_DEL 1 /* Cost of having to delete chars from this to match inStr when preceding char matches */
940 #define COST_OF_INS 16 /* Cost of having to insert chars to this to match inStr. higher cost than DEL to penalize chars contained in inStr but not in this */
941 #define COST_OF_CHANGE 17 /* Cose of changing a char in this to match inStr */
942 #define COST_OF_CASE 1 /* Cose of having to change the case of a char in this to match a char in inStr */
943
LCSMatchScore(const char * inStr,long ALen) const944 long UtilStr::LCSMatchScore( const char* inStr, long ALen ) const {
945 const char *A, *B;
946 long* M;
947 long temp[ STACK_TEMP_SIZE + 1 ];
948 long BLen, a, b, M_bm1, ins, del, mat_mis, c_a, c_b, c_bUC, prev_b_UC, cost;
949
950 // Calc the length if it wasn't given to us
951 if ( ALen < 0 ) {
952 ALen = 0;
953 while ( *(inStr+ALen) )
954 ALen++;
955 }
956
957 /* Subtract 1 from the str address to account for the fact that all accesses
958 are 1 based indexing */
959 A = inStr - 1;
960 B = getCStr() - 1;
961 BLen = length();
962
963
964 /* let A and B be the two strings to be compared.
965 We need to make a 2D table, M[][], that has corner M[ |A| ][ |B| ]. If we evaluate M, row by row, we never
966 need to access a row more further than one row. This permits us to evaluate [ |A| ][ |B| ] with only having
967 to allocate a single row's worth of elements (vs. allocating all |A| rows). */
968
969 // Don't use stack-allocated temp mem if our string is too big...
970 // Top most significant byte in M[][] contains info!
971 if ( ALen < STACK_TEMP_SIZE )
972 M = temp;
973 else
974 M = new long[ ALen + 1 ]; // +1 for the nul str column/representation
975
976 // Initialize the topmost row of M[][] ...
977 M[ 0 ] = 0;
978 for ( a = 1; a <= ALen; a++ )
979 M[ a ] = M[ a - 1 ] + COST_OF_INS;
980
981 // Evaluate from row 2 to row BLen
982 c_bUC = 0;
983 for ( b = 1; b <= BLen; b++ ) {
984 prev_b_UC = c_bUC;
985 c_bUC = c_b = B[ b ];
986 if ( c_bUC >= 'a' && c_bUC <= 'z' )
987 c_bUC -= 32;
988
989 // Evaluate M[ 0, b ] via boundry conditions
990 M_bm1 = M[ 0 ];
991 M[ 0 ] += COST_OF_DEL;
992
993 // Evaluate from column 2 to column ALen
994 for ( a = 1; a <= ALen; a++ ) {
995
996 // === Calc NW term...
997 // Calc the cost of changing B[b] to A[a]...
998 c_a = A[ a ];
999 cost = 0;
1000 if ( c_a != c_b ) {
1001 if ( c_a >= 'a' && c_a <= 'z' )
1002 c_a -= 32;
1003
1004 if ( c_a != c_bUC )
1005 cost = COST_OF_CHANGE;
1006 else
1007 cost = COST_OF_CASE;
1008 }
1009 // NW Term: M_bm1 is M[ a - 1, b - 1 ] (replace A[a] with B[b])
1010 mat_mis = M_bm1 + cost;
1011
1012 // === Calc N term...
1013 // Calc the cost of changing B[b] to nul. Favor contiguity.
1014 // Favor contiguity (case insensitive--any case match is 'contiguous')
1015 cost = COST_OF_DEL;
1016 if ( c_a == prev_b_UC )
1017 cost += COST_CONTIGUOUS;
1018
1019 // N Term: M[ a ] is M[ a, b - 1 ] ( delete B[b] )
1020 del = M[ a ] + cost;
1021
1022 // === Calc W term...
1023 // W Term: M[ a - 1 ] is M[ a - 1, b ] ( insert A[a] )
1024 ins = M[ a - 1 ] + COST_OF_INS;
1025
1026 // Maintain M_bm1 for next iteration of a
1027 M_bm1 = M[ a ];
1028
1029 // Choose the lost cost direction: M[ a, b ] = max( mat_mis, delAa, insBb )
1030 M[ a ] = __MIN( del, ins );
1031 M[ a ] = __MIN( M[ a ], mat_mis );
1032 }
1033 }
1034
1035 if ( ALen >= STACK_TEMP_SIZE )
1036 delete []M;
1037
1038 return 100000 - M[ ALen ];
1039 }
1040
1041
1042 /*
1043 bool UtilStr::equalTo( const char* inStr, int inFlags ) const {
1044 bool stillOk = inStr != 0;
1045 char* thisStr = getCStr();
1046 char c1 = 1, c2 = 2;
1047 bool caseInsensitive = inFlags & cCaseInsensitive != 0;
1048
1049 while ( stillOk && c2 != 0 ) {
1050 c1 = *inStr;
1051 c2 = *thisStr;
1052 if ( caseInsensitive ) {
1053 if ( c1 >= 'a' && c1 <= 'z' )
1054 c1 -= 32;
1055 if ( c2 >= 'a' && c2 <= 'z' )
1056 c2 -= 32;
1057 }
1058 stillOk = c1 == c2;
1059 thisStr += 1;
1060 inStr += 1;
1061 }
1062
1063 if ( ( inFlags & cLefthand ) && c2 == 0 )
1064 stillOk = true;
1065
1066 return stillOk;
1067 }
1068 */
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
GetIntValue(char * inStr,long inLen,long * outPlacePtr)1080 long UtilStr::GetIntValue( char* inStr, long inLen, long* outPlacePtr ) {
1081 bool seenNum = false;
1082 char c;
1083 long i, ret = 0, place = 1;
1084
1085 for ( i = inLen - 1; i >= 0; i-- ) {
1086 c = inStr[ i ];
1087 if ( c >= '0' && c <= '9' ) {
1088 seenNum = true;
1089 ret += place * ( c - '0' );
1090 place *= 10; }
1091 else if ( seenNum )
1092 i = 0; // Stop loop
1093 }
1094
1095 if ( outPlacePtr )
1096 *outPlacePtr = place;
1097
1098 return ret;
1099 }
1100
1101
1102
1103
1104
GetFloatVal(char * inStr,long inLen)1105 double UtilStr::GetFloatVal( char* inStr, long inLen ) {
1106 unsigned long i, decLoc = 0, foundLet = false;
1107 char c;
1108 double n = 0, place = 1.0;
1109 bool isNeg = false;
1110
1111
1112 for ( i = 0; i < inLen; i++ ) {
1113 c = inStr[ i ];
1114
1115 if ( c == '-' && ! foundLet )
1116 isNeg = true;
1117
1118 if ( c >= '0' && c <= '9' ) {
1119 n = 10.0 * n + ( c - '0' );
1120 if ( decLoc )
1121 place *= 10.0;
1122 }
1123
1124 if ( c != ' ' )
1125 foundLet = true;
1126
1127 if ( c == '.' )
1128 decLoc = i+1;
1129 }
1130
1131 if ( isNeg )
1132 n = - n;
1133
1134 return n / place;
1135 }
1136
1137
1138
1139
GetFloatValue() const1140 double UtilStr::GetFloatValue() const {
1141 return GetFloatVal( mBuf + 1, mStrLen );
1142 }
1143
1144
1145
1146
GetValue(long inMultiplier) const1147 long UtilStr::GetValue( long inMultiplier ) const {
1148 unsigned long i, len = length(), decLoc = 0, foundLet = false;
1149 char c;
1150 long left, right, place;
1151
1152 for ( i = 1; i <= len; i++ ) {
1153 c = mBuf[ i ];
1154
1155 if ( c == '-' && ! foundLet )
1156 inMultiplier *= -1;
1157
1158 if ( c != ' ' )
1159 foundLet = true;
1160
1161 if ( c == '.' )
1162 decLoc = i;
1163 }
1164
1165 if ( ! decLoc )
1166 decLoc = len + 1;
1167
1168 left = GetIntValue( mBuf + 1, decLoc - 1 );
1169 right = GetIntValue( mBuf + decLoc + 1, len - decLoc, &place );
1170
1171 return ( inMultiplier * right + place / 2 ) / place + inMultiplier * left;
1172 }
1173
1174
1175
1176
SetValue(long inVal,long inDivisor,int inNumDecPlaces)1177 void UtilStr::SetValue( long inVal, long inDivisor, int inNumDecPlaces ) {
1178 long i, part = inVal % inDivisor;
1179 UtilStr partStr;
1180
1181 for ( i = 0; i < inNumDecPlaces; i++ )
1182 part *= 10;
1183
1184 part /= inDivisor;
1185
1186 i = inVal / inDivisor;
1187 if ( i != 0 || part <= 0 )
1188 Assign( i );
1189 else
1190 Wipe();
1191
1192 if ( part > 0 ) {
1193 Append( '.' );
1194 partStr.Append( part );
1195
1196 for ( i = inNumDecPlaces - partStr.length(); i > 0; i-- )
1197 Append( '0' );
1198
1199 Append( &partStr );
1200 while ( getChar( length() ) == '0' )
1201 Trunc( 1 );
1202 }
1203
1204 }
1205
1206
1207
SetFloatValue(float inValue,int inPercision)1208 void UtilStr::SetFloatValue( float inValue, int inPercision ) {
1209 int deci_digits, left_digits = log10( fabs( inValue ) ) + 1.00001;
1210 float scale;
1211
1212 if ( left_digits < 9 ) {
1213 deci_digits = 10 - left_digits;
1214 if ( deci_digits > inPercision )
1215 deci_digits = inPercision;
1216 scale = pow( 10, deci_digits );
1217 SetValue( inValue * scale, scale, deci_digits ); }
1218 else
1219 Assign( "Overflow" );
1220 }
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
AppendAsMeta(const UtilStr * inStr)1235 void UtilStr::AppendAsMeta( const UtilStr* inStr ) {
1236
1237 if ( inStr )
1238 AppendAsMeta( inStr -> getCStr(), inStr -> length() );
1239 }
1240
1241
1242
1243
AppendAsMeta(const void * inPtr,long inLen)1244 void UtilStr::AppendAsMeta( const void* inPtr, long inLen ) {
1245 const unsigned char* ptr = (unsigned char*) inPtr;
1246 unsigned char c;
1247
1248 Append( '"' );
1249
1250 if ( ptr ) {
1251 while ( inLen > 0 ) {
1252 c = *ptr;
1253
1254 if ( c == '"' ) // Our flag was detected...
1255 Append( (char) '"' ); // Two flags in a row signal an actual "
1256
1257 if ( c < 32 || c > 127 ) {
1258 Append( (char) '"' );
1259 Append( (long) c );
1260 Append( (char) '"' ); }
1261 else
1262 Append( &c, 1 );
1263 inLen--;
1264 ptr++;
1265 }
1266 }
1267 Append( '"' );
1268 }
1269
1270
1271
1272
AppendFromMeta(const void * inPtr,long inLen)1273 void UtilStr::AppendFromMeta( const void* inPtr, long inLen ) {
1274 const unsigned char* ptr = (unsigned char*) inPtr;
1275 unsigned char c;
1276 // FIXME: ascNum was declared static but that caused xmms to
1277 // segfault on exit. I'm not sure if declaring this static is
1278 // a good idea, but the point is there may be a deeper problem.
1279 UtilStr ascNum;
1280
1281 if ( ptr ) {
1282 if ( *ptr != '"' ) // First char should be a "
1283 return;
1284 inLen--;
1285 ptr++;
1286
1287 while ( inLen > 1 ) { // We stop 1 char early cuz last char should be a "
1288 c = *ptr;
1289
1290 if ( c == '"' ) { // If the flag is detected...
1291 inLen--;
1292 ptr++;
1293 c = *ptr;
1294 if ( inLen > 1 && c != '"' ) { // Ignore double flags (they signify the flag char)
1295 ascNum.Wipe();
1296 while ( c >= '0' && c <= '9' ) {
1297 ascNum.Append( (char) c );
1298 inLen--;
1299 ptr++;
1300 c = *ptr;
1301 }
1302 c = ascNum.GetValue();
1303 }
1304 }
1305
1306 Append( (char) c );
1307
1308 inLen--;
1309 ptr++;
1310 }
1311 }
1312 }
1313
1314
1315
1316
1317
1318
1319
operator +(const UtilStr & inStr)1320 UtilStr UtilStr::operator + ( const UtilStr& inStr ) {
1321 UtilStr newStr( this );
1322
1323 newStr.Append( &inStr );
1324 return newStr;
1325 }
1326
1327
1328
operator +(const char * inCStr)1329 UtilStr UtilStr::operator + ( const char* inCStr ) {
1330 UtilStr newStr( this );
1331
1332 newStr.Append( inCStr );
1333 return newStr;
1334 }
1335
1336
1337
operator +(long inNum)1338 UtilStr UtilStr::operator + ( long inNum ) {
1339 UtilStr newStr( this );
1340
1341 newStr.Append( inNum );
1342 return newStr;
1343 }
1344
1345
1346
1347
operator =(const UtilStr & inStr)1348 UtilStr& UtilStr::operator = ( const UtilStr& inStr ) {
1349 Wipe();
1350
1351 Append( inStr.getCStr() );
1352 return *this;
1353 }
1354
1355
1356
1357
1358
1359
1360
Hash() const1361 long UtilStr::Hash() const {
1362 long hash = 0;
1363 const char* stop = getCStr();
1364 const char* curPos = stop + mStrLen - 1;
1365
1366 if ( mStrLen < 16 ) {
1367 // Sample all the characters
1368 while ( curPos >= stop ) {
1369 hash = ( hash * 37 ) + *curPos;
1370 curPos--;
1371 } }
1372 else {
1373 // only sample some characters
1374 int skip = mStrLen / 7;
1375 while ( curPos >= stop ) {
1376 hash = ( hash * 39 ) + *curPos;
1377 curPos -= skip;
1378 }
1379 }
1380
1381 return hash;
1382 }
1383
1384
1385
Equals(const Hashable * inComp) const1386 bool UtilStr::Equals( const Hashable* inComp ) const {
1387
1388 return compareTo( (UtilStr*) inComp ) == 0;
1389 }
1390
1391
1392
1393
1394
AppendHex(char inB1,char inB2)1395 void UtilStr::AppendHex( char inB1, char inB2 ) {
1396 unsigned char c;
1397
1398 if ( inB1 >= '0' && inB1 <= '9' )
1399 inB1 -= '0';
1400 else
1401 inB1 = 9 + inB1 & 0xF;
1402 c = inB1 << 4;
1403
1404 if ( inB2 >= '0' && inB2 <= '9' )
1405 c += (inB2 - '0');
1406 else
1407 c += 9 + inB2 & 0xF;
1408
1409
1410 Append( (char) c );
1411 }
1412
1413
1414
1415
1416 /*
1417
1418
1419
1420 long UtilStr::Hash( const char* inStr, long inStrLen ) {
1421 long hash = 0;
1422 const char* curPos = inStr + inStrLen - 1;
1423
1424 if ( inStrLen < 0 )
1425 inStrLen = inStr's len
1426
1427 if ( inStrLen < 16 ) {
1428 // Sample all the characters
1429 while ( curPos >= inStr ) {
1430 hash = ( hash * 37 ) + *curPos;
1431 curPos--;
1432 } }
1433 else {
1434 // only sample some characters
1435 int skip = inStrLen / 7;
1436 while ( curPos >= inStr ) {
1437 hash = ( hash * 39 ) + *curPos;
1438 curPos -= skip;
1439 }
1440 }
1441
1442 return hash;
1443 }(*/
1444
1445
1446
1447
1448