1 
2 #include "UtilStr.h"
3 
4 
5 #include "CEgIStream.h"
6 #include "CEgOStream.h"
7 
8 #include <math.h>
9 
10 #include <string.h>
11 
12 
UtilStr()13 UtilStr::UtilStr() {
14 
15 	init();
16 }
17 
18 
19 
UtilStr(const char * inCStr)20 UtilStr::UtilStr( const char* inCStr ) {
21 	init();
22 	Append( inCStr );
23 }
24 
25 
26 
27 
UtilStr(const unsigned char * inStrPtr)28 UtilStr::UtilStr( const unsigned char* inStrPtr ) {
29 	init();
30 	Append( inStrPtr );
31 }
32 
33 
34 
UtilStr(const UtilStr & inStr)35 UtilStr::UtilStr( const UtilStr& inStr ) {
36 	init();
37 	Append( &inStr );
38 }
39 
40 
41 
UtilStr(const UtilStr * inStr)42 UtilStr::UtilStr( const UtilStr* inStr ) {
43 	init();
44 
45 	Append( inStr );
46 }
47 
48 
49 
UtilStr(long inNum)50 UtilStr::UtilStr( long inNum ) {
51 	init();
52 	Append( inNum );
53 }
54 
55 
UtilStr(const void * inPtr,long bytes)56 UtilStr::UtilStr( const void* inPtr, long bytes ) {
57 
58 	init();
59 	Append( inPtr, bytes );
60 }
61 
62 
63 
64 #if 0
65 #include <stdio.h>
66 #endif
~UtilStr()67 UtilStr::~UtilStr() {
68 #if 0
69 fprintf(stderr, "~UtilStr()->(this = %.8X mBuf = %.8X)\n", this, mBuf);
70 #endif
71 	if ( mBuf )
72 		delete mBuf;
73 #if 0
74 fprintf(stderr, "<-~UtilStr()\n");
75 #endif
76 }
77 
78 
79 
init()80 void UtilStr::init() {
81 
82 	mStrLen			= 0;
83 	mBufSize 		= 0;
84 	mBuf			= 0;
85 }
86 
87 
88 
Swap(UtilStr & ioStr)89 void UtilStr::Swap( UtilStr& ioStr ) {
90 	long len = mStrLen, size = mBufSize;
91 	char* buf = mBuf;
92 
93 	mBuf			= ioStr.mBuf;
94 	mStrLen			= ioStr.mStrLen;
95 	mBufSize		= ioStr.mBufSize;
96 
97 	ioStr.mBuf		= buf;
98 	ioStr.mStrLen	= len;
99 	ioStr.mBufSize	= size;
100 }
101 
102 
103 
getPasStr() const104 unsigned char* UtilStr::getPasStr() const {
105 
106 	if ( ! mBuf )
107 		return (unsigned char*) "\0";
108 
109 	if ( mStrLen < 255 )
110 		mBuf[0] = mStrLen;
111 	else
112 		mBuf[0] = 255;
113 
114 	return (unsigned char*) &mBuf[0];
115 }
116 
117 
118 
119 
getCStr() const120 char* UtilStr::getCStr() const {
121 
122 	if ( mBuf ) {
123 		mBuf[ mStrLen + 1 ] = '\0';
124 		return &mBuf[1]; }
125 	else
126 		return "\0";
127 
128 }
129 
130 
131 
132 
133 
134 
135 
Append(const char * inCStr)136 void UtilStr::Append( const  char* inCStr ) {
137 	register unsigned long i = 0;
138 
139 	if ( inCStr ) {
140 		while( inCStr[ i ] != '\0' )
141 			i++;
142 		Append( inCStr, i );
143 	}
144 }
145 
146 
147 
Append(const unsigned char * inStrPtr)148 void UtilStr::Append( const unsigned char* inStrPtr ) {
149 
150 	if ( inStrPtr )
151 		Append( (char*) &inStrPtr[1], inStrPtr[0] );
152 
153 }
154 
155 
156 
Append(const void * inSrce,long numBytes)157 void UtilStr::Append( const void* inSrce, long numBytes ) {
158 	unsigned long newLen = numBytes + mStrLen;
159 	char* oldBuf;
160 
161 	if ( numBytes > 0 ) {
162 		if ( newLen >= mBufSize ) {
163 			if ( newLen < 80 )
164 				mBufSize = newLen + 5;
165 			else if ( newLen < 500 )
166 				mBufSize = newLen + 100;
167 			else mBufSize = newLen + 3000;
168 
169 			oldBuf = mBuf;
170 			mBuf = new char[ mBufSize + 2 ];	// One for pascal byte, one for c NUL byte
171 			if ( oldBuf ) {
172 				if ( mStrLen > 0 )
173 					Move( &mBuf[1], &oldBuf[1], mStrLen );
174 
175 				delete oldBuf;
176 			}
177 		}
178 
179 		if ( inSrce && numBytes > 0 )
180 			Move( &mBuf[mStrLen + 1], inSrce, numBytes );
181 
182 		mStrLen = newLen;
183 	}
184 }
185 
186 
187 
188 
189 
Append(long inNum)190 void UtilStr::Append( long inNum ) {
191 	UtilStr temp;
192 	unsigned long i;
193 
194 	if ( inNum < 0 ) {
195 		Append( '-' );
196 		inNum = - inNum;
197 	}
198 
199 	if ( inNum == 0 )
200 		Append( '0' );
201 
202 	while ( inNum > 0 ) {
203 		temp.Append( (char) ('0' + inNum % 10) );
204 		inNum = inNum / 10;
205 	}
206 
207 	for ( i = temp.length(); i > 0; i-- )
208 		Append( temp.getChar( i ) );
209 }
210 
211 
212 
213 
214 
215 
216 
Append(const UtilStr * inStr)217 void UtilStr::Append( const UtilStr* inStr ) {
218 
219 	if ( inStr )
220 		Append( inStr -> getCStr(), inStr -> length() );
221 }
222 
223 
224 
225 
226 
227 
Assign(const UtilStr & inStr)228 void UtilStr::Assign( const UtilStr& inStr ) {
229 
230 	if ( this != &inStr ) {
231 		Wipe();
232 		Append( inStr.getCStr(), inStr.length() );
233 	}
234 }
235 
236 
237 
Assign(const void * inPtr,long bytes)238 void UtilStr::Assign( const void* inPtr, long bytes ) {
239 	Wipe();
240 	Append( inPtr, bytes );
241 }
242 
243 
244 
Assign(char inChar)245 void UtilStr::Assign( char inChar ) {
246 	Wipe();
247 	Append( &inChar, 1 );
248 }
249 
250 
251 
Assign(long inNum)252 void UtilStr::Assign( long inNum ) {
253 
254 	Wipe();
255 	Append( inNum );
256 }
257 
258 
259 
260 
261 
262 
263 
264 
Assign(const unsigned char * inStrPtr)265 void UtilStr::Assign( const unsigned char* inStrPtr ) {
266 	Wipe();
267 	Append( inStrPtr );
268 }
269 
270 
271 
272 
273 
Assign(const UtilStr * inStr)274 void UtilStr::Assign( const UtilStr* inStr ) {
275 
276 
277 	if ( inStr != this ) {
278 		Wipe();
279 		if ( inStr )
280 			Append( inStr -> getCStr(), inStr -> length() );
281 	}
282 }
283 
284 
285 
Assign(CEgIStream & inStream,long numBytes)286 void UtilStr::Assign( CEgIStream& inStream, long numBytes ) {
287 
288 	if ( numBytes > 5000000 )						// *** Safe to say that sizes over this are corrupt?
289 		inStream.throwErr( cCorrupted );
290 	else if ( numBytes > 0 ) {
291 		Dim( numBytes );
292 		if ( length() < numBytes )
293 			numBytes = length();
294 		inStream.GetBlock( getCStr(), numBytes );
295 	}
296 }
297 
298 
299 
300 
301 
302 
ReadFrom(CEgIStream * inStream)303 void UtilStr::ReadFrom( CEgIStream* inStream ) {
304 	unsigned long len = inStream -> GetLong();		// Grab the length
305 
306 	Assign( *inStream, len );
307 }
308 
309 
310 
311 
WriteTo(CEgOStream * inStream) const312 void UtilStr::WriteTo( CEgOStream* inStream ) const {
313 
314 	inStream -> PutLong( length() );
315 	inStream -> PutBlock( getCStr(), length() );
316 }
317 
318 
319 
320 
321 
322 
Prepend(char inChar)323 void UtilStr::Prepend( char inChar ) {
324 
325 	Insert( 0, &inChar, 1 );
326 }
327 
328 
Prepend(UtilStr & inStr)329 void UtilStr::Prepend( UtilStr& inStr ) {
330 
331 	Insert( 0, inStr.getCStr(), inStr.length() );
332 }
333 
334 
335 
Prepend(const char * inStr)336 void UtilStr::Prepend( const char* inStr ) {
337 	long len = 0;
338 
339 	while ( *(char*)(inStr + len) != 0 )
340 		len++;
341 
342 	Insert( 0, inStr, len );
343 }
344 
345 
Insert(unsigned long inPos,char inChar,long inNumTimes)346 void UtilStr::Insert( unsigned long inPos, char inChar, long inNumTimes ) {
347 	unsigned long	oldLen = length();
348 	unsigned long	numAddable;
349 
350 	if ( inPos > oldLen )
351 		inPos = oldLen;
352 
353 	Insert( inPos, (char*) 0, inNumTimes );
354 	numAddable = length() - oldLen;
355 	if ( numAddable > 0 && mBuf ) {
356 		while ( inNumTimes > 0 ) {
357 			mBuf[ inPos + inNumTimes ] = inChar;
358 			inNumTimes--;
359 		}
360 	}
361 }
362 
363 
Insert(unsigned long inPos,long inNum)364 void UtilStr::Insert( unsigned long inPos, long inNum ) {
365 	UtilStr numStr( inNum );
366 
367 	Insert( inPos, numStr );
368 
369 }
370 
Insert(unsigned long inPos,const UtilStr & inStr)371 void UtilStr::Insert( unsigned long inPos, const UtilStr& inStr ) {
372 	Insert( inPos, inStr.getCStr(), inStr.length() );
373 }
374 
375 
Insert(unsigned long inPos,const char * inSrce,long inBytes)376 void UtilStr::Insert( unsigned long inPos, const char* inSrce, long inBytes ) {
377 	unsigned long numToMove, len = length();
378 
379 	if ( inPos >= len )
380 		Append( inSrce, inBytes );
381 	else if ( inBytes > 0 ) {
382 		Append( 0, inBytes );
383 		numToMove = len - inPos;
384 		if ( numToMove > 0 )
385 			Move( &mBuf[ inPos + inBytes + 1 ], &mBuf[ inPos + 1 ], numToMove );
386 		if ( inBytes > 0 && inSrce )
387 			Move( &mBuf[ inPos + 1 ], inSrce, inBytes );
388 	}
389 }
390 
391 
392 
393 
394 
395 
396 
Move(void * inDest,const void * inSrce,unsigned long inNumBytes)397 void UtilStr::Move( void* inDest, const void* inSrce, unsigned long inNumBytes ) {
398 
399 	if ( inNumBytes <= 64 ) {
400 		register unsigned char* dest = (unsigned char*) inDest;
401 		register unsigned char* srce = (unsigned char*) inSrce;
402 		if ( srce > dest )
403 			while ( inNumBytes > 0 ) {
404 				*dest = *srce;
405 				dest++;
406 				srce++;
407 				inNumBytes--;
408 			}
409 		else {
410 			dest += inNumBytes;
411 			srce += inNumBytes;
412 			while ( inNumBytes > 0 ) {
413 				dest--;
414 				srce--;
415 				*dest = *srce;
416 				inNumBytes--;
417 			} } }
418 	else
419 	        memmove(inDest, inSrce, inNumBytes);
420 }
421 
422 
423 
Decapitalize()424 void UtilStr::Decapitalize() {
425 	unsigned long i, len = length();
426 	unsigned char c, sp;
427 
428 	for( i = 2; i <= len; i++ ) {
429 		c = 	getChar( i );
430 		sp = 	getChar( i-1 );
431 		if ( ( sp >= 'A' && sp <= 'Z' ) || ( sp >= 'a' && sp <= 'z' ) )  {
432 			if ( getChar( i-1 ) == 'I' && c == 'I' ) { }	// Prevent III from decapitalizing
433 			else if ( c >= 'A' && c <= 'Z' )
434 				setChar( i, c + 32 );
435 		}
436 	}
437 
438 }
439 
440 
441 
442 
443 
Capitalize()444 void UtilStr::Capitalize() {
445 	unsigned long i, len = length();
446 	char c;
447 
448 	for( i = 1; i <= len; i++ ) {
449 		c = getChar( i );
450 		if ( c >= 'a' && c <= 'z' )  {
451 			setChar( i, c - 32 );
452 		}
453 	}
454 
455 }
456 
457 
458 
459 
Trunc(unsigned long numToChop,bool fromRight)460 void UtilStr::Trunc( unsigned long numToChop, bool fromRight ) {
461 
462 	if ( fromRight )
463 		Remove( length() - numToChop + 1, numToChop );
464 	else
465 		Remove( 1, numToChop );
466 }
467 
468 
469 
470 
Remove(unsigned long inPos,unsigned long inNum)471 void UtilStr::Remove( unsigned long inPos, unsigned long inNum ) {
472 	unsigned long	toMove;
473 	unsigned long	len		= length();
474 
475 	if ( inPos < 1 )
476 		inPos = 1;
477 
478 	if ( inNum > len - inPos + 1 )
479 		inNum = len - inPos + 1;
480 
481 	if ( inPos <= len && inNum > 0 ) {
482 
483 		mStrLen = len - inNum;
484 		toMove = len - inPos - inNum + 1;
485 
486 		if ( toMove > 0 )
487 			Move( &mBuf[ inPos ], &mBuf[ inPos + inNum ], toMove );
488 	}
489 }
490 
491 
492 
493 
Remove(char * inStr,int inLen,bool inCaseSensitive)494 void UtilStr::Remove( char* inStr, int inLen, bool inCaseSensitive ) {
495 	long pos;
496 	char* s;
497 
498 	if ( inLen < 0 ) {
499 		inLen = 0;
500 		s = inStr;
501 		while ( *s ) {
502 			inLen++;
503 			s++;
504 		}
505 	}
506 
507 	pos = contains( inStr, inLen, 0, inCaseSensitive );
508 	while ( pos > 0 ) {
509 		Remove( pos, inLen );
510 		pos--;
511 		pos = contains( inStr, inLen, pos, inCaseSensitive );
512 	}
513 }
514 
515 
516 
517 
518 
Keep(unsigned long inNumToKeep)519 void UtilStr::Keep( unsigned long inNumToKeep ) {
520 
521 	if ( inNumToKeep < length() )
522 		Trunc( length() - inNumToKeep, true );
523 }
524 
525 
526 
527 
PoliteKeep(unsigned long inMaxLen,unsigned long inPos)528 void UtilStr::PoliteKeep( unsigned long inMaxLen, unsigned long inPos ) {
529 	if ( length() > inMaxLen ) {
530 		Remove( inPos + 1, length() - inMaxLen + 1 );
531 		setChar( inPos, '�' );
532 	}
533 }
534 
535 
536 
ZapLeadingSpaces()537 void UtilStr::ZapLeadingSpaces() {
538 	unsigned long 	i 		= 1;
539 	unsigned long	len		= length();
540 
541 	while ( getChar( i ) == ' ' && i <= len )
542 		i++;
543 
544 	if ( i > 1 )
545 		Trunc( i - 1, false );
546 }
547 
548 
549 
550 
551 
552 /*
553 
554 
555 UtilStr UtilStr::MID( unsigned long start, unsigned long inLen ) const {
556 	UtilStr		newStr;
557 	unsigned long	len = length();
558 
559 	if ( start < 1 )
560 		start = 1;
561 
562 	start--;
563 
564 	if ( inLen > len - start )
565 		inLen = len - start;
566 
567 	if ( start <= len && inLen > 0 )
568 		newStr.Assign( getCStr() + start, inLen );
569 
570 	return newStr;
571 }
572 
573 
574 
575 
576 UtilStr UtilStr::RIGHT( unsigned long inLen ) const {
577 	UtilStr		newStr;
578 	unsigned long 	start, len = length();
579 
580 	if ( inLen > len )
581 		inLen = len;
582 
583 	start = len - inLen;
584 
585 	newStr.Assign( getCStr() + start, inLen );
586 	return newStr;
587 }
588 */
589 
590 
591 
592 
593 
594 
595 
FindNextInstanceOf(long inPos,char c) const596 long UtilStr::FindNextInstanceOf( long inPos, char c ) const {
597 	long len = length(), i;
598 
599 	if ( inPos < 0 )
600 		inPos = 0;
601 
602 	for ( i = inPos+1; i <= len; i++ ) {
603 		if ( mBuf[ i ] == c )
604 			return i;
605 	}
606 
607 	return 0;
608 }
609 
610 
FindPrevInstanceOf(long inPos,char c) const611 long UtilStr::FindPrevInstanceOf( long inPos, char c ) const {
612 	long i;
613 
614 	if ( inPos > length() )
615 		inPos = length();
616 
617 	for ( i = inPos; i > 0; i-- ) {
618 		if ( mBuf[ i ] == c )
619 			return i;
620 	}
621 
622 	return 0;
623 }
624 
625 
Replace(char inTarget,char inReplacement)626 long UtilStr::Replace( char inTarget, char inReplacement ) {
627 	unsigned long count, i, len = length();
628 
629 	count = 0;
630 
631 	for ( i = 1; i <= len; i++ ) {
632 		if ( mBuf[ i ] == inTarget ) {
633 			mBuf[ i ] = inReplacement;
634 			count++;
635 		}
636 	}
637 
638 	return count;
639 }
640 
641 
Replace(char * inTarget,char * inReplacement,bool inCaseSensitive)642 long UtilStr::Replace( char* inTarget, char* inReplacement, bool inCaseSensitive ) {
643 	char* srce;
644 	long count = 0, pos, prevPos = 0;
645 
646 	// Calc the len of the target
647 	long targLen = 0;
648 	while ( inTarget[ targLen ] )
649 		targLen++;
650 
651 	// See if there's at least one instance of the target in this string...
652 	pos = contains( inTarget, targLen, 0, inCaseSensitive );
653 	if ( pos ) {
654 
655 		// Make this the dest str
656 		UtilStr srceStr( this );
657 		srce = srceStr.getCStr();
658 		count = 0;
659 		Keep( pos - 1 );
660 		goto _resume;
661 
662 		while ( pos ) {
663 			Append( srce + prevPos, pos - prevPos - 1 );
664 _resume:	Append( inReplacement );
665 			count++;
666 			prevPos = pos + targLen - 1;
667 			pos = srceStr.contains( inTarget, targLen, prevPos, inCaseSensitive );
668 		}
669 
670 		Append( srce + prevPos, srceStr.length() - prevPos );
671 	}
672 
673 	return count;
674 }
675 
copyTo(unsigned char * pasDestPtr,unsigned char inBytesToCopy) const676 void UtilStr::copyTo( unsigned char* pasDestPtr, unsigned char inBytesToCopy ) const {
677 	unsigned long 	bytes = length() + 1;
678 
679 	if ( bytes > inBytesToCopy )
680 		bytes = inBytesToCopy;
681 
682 	getPasStr();			// refreshes len byte for pas str
683 
684 	if ( bytes > 255 )
685 		bytes = 255;
686 
687     Move( pasDestPtr, &mBuf[0], bytes );
688 }
689 
690 
691 
692 
copyTo(char * inDestPtr,unsigned long inBytesToCopy) const693 void UtilStr::copyTo( char* inDestPtr, unsigned long inBytesToCopy ) const {
694 	unsigned long bytes = length() + 1;
695 
696 	if ( bytes > inBytesToCopy )
697 		bytes = inBytesToCopy;
698 
699 	getCStr();			// refreshes NUL byte for c strs
700 
701 
702 	Move( inDestPtr, &mBuf[1], bytes );
703 }
704 
705 
706 
707 
708 
709 
710 
getChar(unsigned long i) const711 char UtilStr::getChar( unsigned long i ) const {
712 
713 	if ( i <= mStrLen && i > 0 )
714 		return mBuf[ i ];
715 	else
716 		return '\0';
717 }
718 
719 
720 
setChar(unsigned long i,char inChar)721 void UtilStr::setChar( unsigned long i, char inChar ) {
722 
723 
724 	if ( i <= mStrLen && i > 0 )
725 		mBuf[ i ] = inChar;
726 }
727 
728 
729 
730 
731 
StrCmp(const char * s1,const char * s2,long inN,bool inCaseSensitive)732 int UtilStr::StrCmp( const char* s1, const char* s2, long inN, bool inCaseSensitive ) {
733 	char c1, c2;
734 
735 	if ( inN < 0 ) {
736 		inN = 0;
737 		const char* s = (*s1 != 0) ? s1 : s2;
738 		while ( *s ) {
739 			s++;
740 			inN++;
741 		}
742 	}
743 
744 	while ( inN > 0 ) {
745 		inN--;
746 		c1 = *s1;
747 		s1++;
748 		c2 = *s2;
749 		s2++;
750 		if ( ! inCaseSensitive ) {
751 			if ( c1 >= 'a' && c1 <= 'z' )
752 				c1 -= 32;
753 			if ( c2 >= 'a' && c2 <= 'z' )
754 				c2 -= 32;
755 		}
756 		if ( c1 != c2 )
757 			return c1 - c2;
758 	}
759 
760 	return 0;
761 }
762 
763 
764 
765 
compareTo(const unsigned char * inPStr,bool inCaseSensitive) const766 int UtilStr::compareTo( const unsigned char* inPStr, bool inCaseSensitive ) const {
767 
768 	if ( inPStr ) {
769 		if ( length() == inPStr[0] ) {
770 			return StrCmp( getCStr(), (char*) (inPStr + 1), length(), inCaseSensitive );
771 		}
772 	}
773 
774 	return -1;
775 }
776 
777 
compareTo(const UtilStr * inStr,bool inCaseSensitive) const778 int UtilStr::compareTo( const UtilStr* inStr, bool inCaseSensitive ) const {
779 
780 	if ( inStr )
781 		return StrCmp( inStr -> getCStr(), getCStr(), length() + 1, inCaseSensitive );
782 	else
783 		return -1;
784 }
785 
786 
compareTo(const char * inStr,bool inCaseSensitive) const787 int UtilStr::compareTo( const char* inStr, bool inCaseSensitive ) const {
788 	if ( inStr )
789 		return StrCmp( inStr, getCStr(), length() + 1, inCaseSensitive );
790 	else
791 		return -1;
792 }
793 
794 
795 
contains(const char * inSrchStr,int inLen,int inStartingPos,bool inCaseSensitive) const796 long UtilStr::contains( const char* inSrchStr, int inLen, int inStartingPos, bool inCaseSensitive ) const {
797 	char	srchChar, srchCharLC, c;
798 	char*	endPtr, *curPtr = getCStr();
799 
800 	if ( inLen < 0 ) {
801 		inLen = 0;
802 		while ( *(inSrchStr+inLen) )
803 			inLen++;
804 	}
805 
806 	endPtr = curPtr + length() - inLen;
807 
808 	srchChar = *inSrchStr;
809 	if ( srchChar >= 'a' && srchChar <= 'z' )
810 		srchChar -= 32;
811 	srchCharLC	= srchChar + 32;
812 	if ( inStartingPos > 0 )
813 		curPtr += inStartingPos;
814 
815 	while ( curPtr <= endPtr ) {
816 		c = *curPtr;
817 		if ( c == srchChar || c == srchCharLC ) {
818 			if ( StrCmp( curPtr, inSrchStr, inLen, inCaseSensitive ) == 0 )
819 				return curPtr - getCStr() + 1;
820 		}
821 		curPtr++;
822 	}
823 
824 	return 0;
825 }
826 
827 
828 
829 /*
830 *** The following is some string silmilarity/matching theory...  man, i miss cornell cs...
831 
832 Dynamic Programming Algorithm for Sequence Alignment.
833 Dynamic Programming Algorithms are used for finding shortest paths in graphs, and in many other optimization problems, but in the comparison or alignment of strings (as in Biological DNA, RNA and protein sequence analysis, speech recognition and shape comparison) the following, or similar, is often called "the" dynamic programming algorithm (DPA).
834 
835 Generic Dynamic Programming Algorithm for Comparing Two Strings:
836 Given two strings or sequences A[1..|A|] and B[1..|B|]
837 
838 M[0, 0] = z                                    -- usually z=0
839 M[i, 0] = f( M[i-1, 0  ], c(A[i], "_" ) )      -- Boundary conditions - delete A[i]
840 M[0, j] = f( M[0,   j-1], c("_",  B[j]) )      -- Boundary conditions - delete B[j]
841 
842 M[i, j] = g( f( M[i-1, j-1], c(A[i], B[j]) ),  -- match/mismatch
843              f( M[i-1, j  ], c(A[i], "_" ) ),  -- delete A[i]
844              f( M[i,   j-1], c("_",  B[j]) ) ) -- insert B[j]
845 
846 Note that "_" represents the null (pseudo-)character.
847 
848 M[i,j] represents the cost (or score) of the partial sequences A[1..i] and B[1..j], and the algorithm requires that M[i,j] can be calculated from the three neighbours of M[i,j] - to the north (M[i,j-1]), west (M[i-1,j]), and north west (M[i-1,j-1]).
849 
850 c( x, y ):  Cost of replacing character x with character y
851 f( ):       String reconcatination function
852 g( ):       Substring score chooser function
853 
854 O( AB ) running time (Assming const running const running time for g, f and c)
855 O( A ) or O( B ) storage
856 
857 So what are z, f(), g(), and c()?  By varing them, the returned score comparison will follow different match criteria/behavior.  Here's 5 sets of assignments:
858 
859 1)  === Longest Common Subsequence (LCS or LCSS)
860 z = 0
861 g( ) = min( )
862 f( ) = +
863 c(x,x) = 0, c(x,y) = c(x,"_") = c("_",x) = 1
864 A big LCS score means the sequences are similar - lots of matches c(x,x).
865 Note that an optimal LCS sequence alignment can be recovered either by retracing the `max' choices that were made from M[|A|,|B|] to M[0,0], or by using Hirschberg's (1975) divide and conquer technique.
866 
867 2)  === Levenshtein Metric or Sellers' Edit Distance
868 A big edit distance value means the sequences are dissimilar - lots of changes c(x,y), and indels c(x,"_") and c("_",x).
869 
870 z = 0
871 g( ) = min( )
872 f( ) = +
873 c(x,x) = 0, c(x,y), c(x,"_"), c("_",x) > 0
874 The above assumes "simple" gap costs; linear but some other more complex gap costs can be incorporated with modifications.
875 
876 3)  === Probability of Alignments
877 For given probabilities, P(match), P(mismatch), P(insert) and P(delete), the following dynamic programming algorithm finds a most probable alignment of two given sequences:
878 
879 z = 1       -- NB.
880 g( ) = max( )
881 f( ) = *
882 c(x,x) = P(match) * P(x)
883 c(x,y) = P(mismatch) * P(x,y | x!=y)
884 c(x,"_") = P(delete) * P(x)
885 c("_",x) = P(insert) * P(x)
886 Unfortunately the quantities calculated become very small for strings of realistic lengths and underflow is the consequence. It is more convenient to deal with the -log's of probabilities and this also corresponds to a coding or information theory interpretation - see below.
887 
888 4)   === Minimum Message Length MML Optimal Alignment
889 aka Minimum Description Length MDL
890 
891 z = 0
892 g( ) = min( )
893 f( ) = +
894 c(x,x) = -log2(P(match)) - log2(P(x))
895 c(x,y) = -log2(P(mismatch)) - log2(P(x,y | x!=y))
896 c(x,"_") = -log2(P(delete)) - log2(P(x))
897 c("_",x) = -log2(P(insert) -log2(P(x))
898 - assuming "simple" gap costs; modify if not. Base-two logs are taken if you wish to interpret the quantities as bits; some prefer natural logarithms, giving nits.
899 An alignment can be thought of as a hypothesis of how strings A and B are related, and it can be compared with the null-theory that they are not related (Allison et al 1990(a), 1990(b), 1992). Thus, if an alignment gives some real data compression for the pair of strings, it is an acceptable hypothesis.
900 
901 5)   === Minimum Message Length R-Theory, Sum Over All Alignments
902 The r-theory (r for related) is the hypothesis that two strings are related in some unspecified way. Its (-log2) probability is calculated as above except that
903 
904 g(,) = logplus(,)
905 where logplus(-log2(P1), -log2(P2)) = -log2(P1+P2)
906 One alignment is just one hypothesis of how one string changed into the other. There may be many optimal alignments and many more suboptimal alignments. Two alignments are two different or exclusive hypotheses so their probabilities can be added.
907 
908 Thus the sum over all alignments gives the -log2 probability of the two strings being created in a related but unspecified way. The complement of this r-theory is that the strings are not related. Comparing the message lengths of the r-theory and the null-theory gives the posterior -log odds-ratio that the two strings are related (for a given model of string relation of course):
909 
910 r-theory:
911   P(A & B & related) = P(A & B).P(related | A & B)
912                      = P(related).P(A & B | related)
913   where P(A & B | related) = Sum[all L] P(A & B | alignment L)
914 
915 null-theory:
916   P(A & B & not related) = P(not related).P(A & B | not related)
917                          = P(not related).P(A).P(B)
918                          = P(A & B).P(not related | A & B)
919 
920 
921 We can put a 50:50 prior on being related or unrelated. We do not know the prior probability of the data, P(A&B), but we can cancel it out to get the posterior -log-odds ratio of the r-theory and the null-theory.
922 
923 It is fortunate that the sum of P(A&B|L) over all alignments L can be calculated in O(|A|*|B|) time for "finite-state" models of relation (based on finite-state machines) using the dynamic programming algorithm above and variations upon it (Allison et al 1990(a), 1990(b), 1992).
924 
925 References.
926 L. Allison, C. S. Wallace and C. N. Yee. When is a String like a String? AI & Maths 1990(a)
927 L. Allison, C. S. Wallace and C. N. Yee. Inductive inference over macro-molecules. TR 90/148 Department of Computer Science, Monash University, November 1990(b)
928 L. Allison, C. S. Wallace and C. N. Yee. Finite-State Models in the Alignment of Macromolecules. Jrnl. Molec. Evol. 35 77-89 1992
929 D. S. Hirschberg. A Linear Space Algorithm for Computing Maximal Common Subsequences. Comm. Assoc. Comp. Mach. 18(6) 341-343 1975
930 V. I. Levenshtein. Binary Codes Capable of Correcting Deletions, Insertions and Reversals. Soviet Physics Doklady 10(8) 707-710 1966 and Doklady Akademii Nauk SSSR 163(4) 845-848 1965
931 P. Sellers. On the Theory and Computation of Evolutionary Distances. SIAM J. Appl. Math 26(4) 787-793 1974
932 Lloyd Allison, Department of Computer Science, Monash University, Australia 3168
933 */
934 
935 // An implementaion of the Longest Common Subsequece function set for the general Sequence Alignment alg described above
936 #define __MIN( a, b ) ( ( (a) < (b) ) ? (a) : (b) )
937 #define STACK_TEMP_SIZE  30
938 #define COST_CONTIGUOUS			1
939 #define COST_OF_DEL				1 /* Cost of having to delete chars from this to match inStr when preceding char matches */
940 #define COST_OF_INS				16 /* Cost of having to insert chars to this to match inStr.  higher cost than DEL to penalize chars contained in inStr but not in this */
941 #define COST_OF_CHANGE 			17 /* Cose of changing a char in this to match inStr */
942 #define COST_OF_CASE			1 /* Cose of having to change the case of a char in this to match a char in inStr */
943 
LCSMatchScore(const char * inStr,long ALen) const944 long UtilStr::LCSMatchScore( const char* inStr, long ALen ) const {
945 	const char *A, *B;
946 	long*	M;
947 	long	temp[ STACK_TEMP_SIZE + 1 ];
948 	long	BLen, a, b, M_bm1, ins, del, mat_mis, c_a, c_b, c_bUC, prev_b_UC, cost;
949 
950 	// Calc the length if it wasn't given to us
951 	if ( ALen < 0 ) {
952 		ALen = 0;
953 		while ( *(inStr+ALen) )
954 			ALen++;
955 	}
956 
957 	/* Subtract 1 from the str address to account for the fact that all accesses
958 	are 1 based indexing */
959 	A = inStr - 1;
960 	B = getCStr() - 1;
961 	BLen = length();
962 
963 
964 	/* let A and B be the two strings to be compared.
965 	We need to make a 2D table, M[][], that has corner M[ |A| ][ |B| ].  If we evaluate M, row by row, we never
966 	need to access a row more further than one row.  This permits us to evaluate [ |A| ][ |B| ] with only having
967 	to allocate a single row's worth of elements (vs. allocating all |A| rows).  */
968 
969 	// Don't use stack-allocated temp mem if our string is too big...
970 	// Top most significant byte in M[][] contains info!
971 	if ( ALen < STACK_TEMP_SIZE )
972 		M = temp;
973 	else
974 		M = new long[ ALen + 1 ];  // +1 for the nul str column/representation
975 
976 	// Initialize the topmost row of M[][] ...
977 	M[ 0 ] = 0;
978 	for ( a = 1; a <= ALen; a++ )
979 		M[ a ] = M[ a - 1 ] + COST_OF_INS;
980 
981 	// Evaluate from row 2 to row BLen
982 	c_bUC = 0;
983 	for ( b = 1; b <= BLen; b++ ) {
984 		prev_b_UC = c_bUC;
985 		c_bUC = c_b = B[ b ];
986 		if ( c_bUC >= 'a' && c_bUC <= 'z' )
987 			c_bUC -= 32;
988 
989 		// Evaluate M[ 0, b ] via boundry conditions
990 		M_bm1 = M[ 0 ];
991 		M[ 0 ] += COST_OF_DEL;
992 
993 		// Evaluate from column 2 to column ALen
994 		for ( a = 1; a <= ALen; a++ ) {
995 
996 			// === Calc NW term...
997 			// Calc the cost of changing B[b] to A[a]...
998 			c_a = A[ a ];
999 			cost = 0;
1000 			if ( c_a != c_b ) {
1001 				if ( c_a >= 'a' && c_a <= 'z' )
1002 					c_a -= 32;
1003 
1004 				if ( c_a != c_bUC )
1005 					cost = COST_OF_CHANGE;
1006 				else
1007 					cost = COST_OF_CASE;
1008 			}
1009 			// NW Term:  M_bm1 is M[ a - 1, b - 1 ] (replace A[a] with B[b])
1010 			mat_mis = M_bm1 + cost;
1011 
1012 			// === Calc N term...
1013 			// Calc the cost of changing B[b] to nul.  Favor contiguity.
1014 			// Favor contiguity (case insensitive--any case match is 'contiguous')
1015 			cost = COST_OF_DEL;
1016 			if ( c_a == prev_b_UC )
1017 				cost += COST_CONTIGUOUS;
1018 
1019 			// N Term: M[ a ] is M[ a, b - 1 ] ( delete B[b] )
1020 			del = M[ a ] + cost;
1021 
1022 			// === Calc W term...
1023 			// W Term: M[ a - 1 ] is M[ a - 1, b ]  ( insert A[a] )
1024 			ins = M[ a - 1 ] + COST_OF_INS;
1025 
1026 			// Maintain M_bm1 for next iteration of a
1027 			M_bm1 =	M[ a ];
1028 
1029 			// Choose the lost cost direction:  M[ a, b ] = max( mat_mis, delAa, insBb )
1030 			M[ a ] = __MIN( del, ins );
1031 			M[ a ] = __MIN( M[ a ], mat_mis );
1032 		}
1033 	}
1034 
1035 	if ( ALen >= STACK_TEMP_SIZE )
1036 		delete []M;
1037 
1038 	return 100000 - M[ ALen ];
1039 }
1040 
1041 
1042 /*
1043 bool UtilStr::equalTo( const char* inStr, int inFlags ) const {
1044 	bool		stillOk 			= inStr != 0;
1045 	char*		thisStr 			= getCStr();
1046 	char		c1 = 1, c2			= 2;
1047 	bool		caseInsensitive 	= inFlags & cCaseInsensitive != 0;
1048 
1049 	while ( stillOk && c2 != 0 ) {
1050 		c1 = *inStr;
1051 		c2 = *thisStr;
1052 		if ( caseInsensitive ) {
1053 			if ( c1 >= 'a' && c1 <= 'z' )
1054 				c1 -= 32;
1055 			if ( c2 >= 'a' && c2 <= 'z' )
1056 				c2 -= 32;
1057 		}
1058 		stillOk = c1 == c2;
1059 		thisStr	+= 1;
1060 		inStr	+= 1;
1061 	}
1062 
1063 	if ( ( inFlags & cLefthand ) && c2 == 0 )
1064 		stillOk = true;
1065 
1066 	return stillOk;
1067 }
1068 */
1069 
1070 
1071 
1072 
1073 
1074 
1075 
1076 
1077 
1078 
1079 
GetIntValue(char * inStr,long inLen,long * outPlacePtr)1080 long UtilStr::GetIntValue( char* inStr, long inLen, long* outPlacePtr ) {
1081 	bool seenNum = false;
1082 	char c;
1083 	long i, ret = 0, place = 1;
1084 
1085 	for ( i = inLen - 1; i >= 0; i-- ) {
1086 		c = inStr[ i ];
1087 		if ( c >= '0' && c <= '9' ) {
1088 			seenNum = true;
1089 			ret += place * ( c - '0' );
1090 			place *= 10; }
1091 		else if ( seenNum )
1092 			i = 0;		// Stop loop
1093 	}
1094 
1095 	if ( outPlacePtr )
1096 		*outPlacePtr = place;
1097 
1098 	return ret;
1099 }
1100 
1101 
1102 
1103 
1104 
GetFloatVal(char * inStr,long inLen)1105 double UtilStr::GetFloatVal( char* inStr, long inLen ) {
1106 	unsigned long i, decLoc = 0, foundLet = false;
1107 	char c;
1108 	double n = 0, place = 1.0;
1109 	bool isNeg = false;
1110 
1111 
1112 	for ( i = 0; i < inLen; i++ ) {
1113 		c = inStr[ i ];
1114 
1115 		if ( c == '-' && ! foundLet )
1116 			isNeg = true;
1117 
1118 		if ( c >= '0' && c <= '9' ) {
1119 			n = 10.0 * n + ( c - '0' );
1120 			if ( decLoc )
1121 				place *= 10.0;
1122 		}
1123 
1124 		if ( c != ' ' )
1125 			foundLet = true;
1126 
1127 		if ( c == '.' )
1128 			decLoc = i+1;
1129 	}
1130 
1131 	if ( isNeg )
1132 		n = - n;
1133 
1134 	return n / place;
1135 }
1136 
1137 
1138 
1139 
GetFloatValue() const1140 double UtilStr::GetFloatValue() const {
1141 	return GetFloatVal( mBuf + 1, mStrLen );
1142 }
1143 
1144 
1145 
1146 
GetValue(long inMultiplier) const1147 long UtilStr::GetValue( long inMultiplier ) const {
1148 	unsigned long i, len = length(), decLoc = 0, foundLet = false;
1149 	char c;
1150 	long left, right, place;
1151 
1152 	for ( i = 1; i <= len; i++ ) {
1153 		c = mBuf[ i ];
1154 
1155 		if ( c == '-' && ! foundLet )
1156 			inMultiplier *= -1;
1157 
1158 		if ( c != ' ' )
1159 			foundLet = true;
1160 
1161 		if ( c == '.' )
1162 			decLoc = i;
1163 	}
1164 
1165 	if ( ! decLoc )
1166 		decLoc = len + 1;
1167 
1168 	left 	= GetIntValue( mBuf + 1, decLoc - 1 );
1169 	right 	= GetIntValue( mBuf + decLoc + 1, len - decLoc, &place );
1170 
1171 	return ( inMultiplier * right + place / 2 ) / place + inMultiplier * left;
1172 }
1173 
1174 
1175 
1176 
SetValue(long inVal,long inDivisor,int inNumDecPlaces)1177 void UtilStr::SetValue( long inVal, long inDivisor, int inNumDecPlaces ) {
1178 	long			i, part 	= inVal % inDivisor;
1179 	UtilStr		partStr;
1180 
1181 	for ( i = 0; i < inNumDecPlaces; i++ )
1182 		part *= 10;
1183 
1184 	part /= inDivisor;
1185 
1186 	i = inVal / inDivisor;
1187 	if ( i != 0 || part <= 0 )
1188 		Assign( i );
1189 	else
1190 		Wipe();
1191 
1192 	if ( part > 0 ) {
1193 		Append( '.' );
1194 		partStr.Append( part );
1195 
1196 		for ( i = inNumDecPlaces - partStr.length(); i > 0; i-- )
1197 			Append( '0' );
1198 
1199 		Append( &partStr );
1200 		while ( getChar( length() ) == '0' )
1201 			Trunc( 1 );
1202 	}
1203 
1204 }
1205 
1206 
1207 
SetFloatValue(float inValue,int inPercision)1208 void UtilStr::SetFloatValue( float inValue, int inPercision ) {
1209 	int deci_digits, left_digits = log10( fabs( inValue ) ) + 1.00001;
1210 	float scale;
1211 
1212 	if ( left_digits < 9 ) {
1213 		deci_digits = 10 - left_digits;
1214 		if ( deci_digits > inPercision )
1215 			deci_digits = inPercision;
1216 		scale =  pow( 10, deci_digits );
1217 		SetValue( inValue * scale, scale, deci_digits ); }
1218 	else
1219 		Assign( "Overflow" );
1220 }
1221 
1222 
1223 
1224 
1225 
1226 
1227 
1228 
1229 
1230 
1231 
1232 
1233 
1234 
AppendAsMeta(const UtilStr * inStr)1235 void UtilStr::AppendAsMeta( const UtilStr* inStr ) {
1236 
1237 	if ( inStr )
1238 		AppendAsMeta( inStr -> getCStr(), inStr -> length() );
1239 }
1240 
1241 
1242 
1243 
AppendAsMeta(const void * inPtr,long inLen)1244 void UtilStr::AppendAsMeta( const void* inPtr, long inLen ) {
1245 	const unsigned char* ptr = (unsigned char*) inPtr;
1246 	unsigned char c;
1247 
1248 	Append( '"' );
1249 
1250 	if ( ptr ) {
1251 		while ( inLen > 0 ) {
1252 			c = *ptr;
1253 
1254 			if ( c == '"' ) 						// Our flag was detected...
1255 				Append( (char) '"' );				// Two flags in a row signal an actual "
1256 
1257 			if ( c < 32 || c > 127 ) {
1258 				Append( (char) '"' );
1259 				Append( (long) c );
1260 				Append( (char) '"' ); }
1261 			else
1262 				Append( &c, 1 );
1263 			inLen--;
1264 			ptr++;
1265 		}
1266 	}
1267 	Append( '"' );
1268 }
1269 
1270 
1271 
1272 
AppendFromMeta(const void * inPtr,long inLen)1273 void UtilStr::AppendFromMeta( const void* inPtr, long inLen ) {
1274 	const unsigned char* ptr = (unsigned char*) inPtr;
1275 	unsigned char c;
1276 	// FIXME:  ascNum was declared static but that caused xmms to
1277 	// segfault on exit.  I'm not sure if declaring this static is
1278 	// a good idea, but the point is there may be a deeper problem.
1279 	UtilStr ascNum;
1280 
1281 	if ( ptr ) {
1282 		if ( *ptr != '"' )							// First char should be a "
1283 			return;
1284 		inLen--;
1285 		ptr++;
1286 
1287 		while ( inLen > 1 ) {						// We stop 1 char early cuz last char should be a "
1288 			c = *ptr;
1289 
1290 			if ( c == '"' ) {						// If the flag is detected...
1291 				inLen--;
1292 				ptr++;
1293 				c = *ptr;
1294 				if ( inLen > 1 && c != '"' ) {		// Ignore double flags (they signify the flag char)
1295 					ascNum.Wipe();
1296 					while ( c >= '0' && c <= '9' ) {
1297 						ascNum.Append( (char) c );
1298 						inLen--;
1299 						ptr++;
1300 						c = *ptr;
1301 					}
1302 					c = ascNum.GetValue();
1303 				}
1304 			}
1305 
1306 			Append( (char) c );
1307 
1308 			inLen--;
1309 			ptr++;
1310 		}
1311 	}
1312 }
1313 
1314 
1315 
1316 
1317 
1318 
1319 
operator +(const UtilStr & inStr)1320 UtilStr UtilStr::operator + ( const UtilStr& inStr ) {
1321 	UtilStr	newStr( this );
1322 
1323 	newStr.Append( &inStr );
1324 	return newStr;
1325 }
1326 
1327 
1328 
operator +(const char * inCStr)1329 UtilStr UtilStr::operator + ( const char*  inCStr ) {
1330 	UtilStr	newStr( this );
1331 
1332 	newStr.Append( inCStr );
1333 	return newStr;
1334 }
1335 
1336 
1337 
operator +(long inNum)1338 UtilStr UtilStr::operator + ( long inNum ) {
1339 	UtilStr	newStr( this );
1340 
1341 	newStr.Append( inNum );
1342 	return newStr;
1343 }
1344 
1345 
1346 
1347 
operator =(const UtilStr & inStr)1348 UtilStr& UtilStr::operator = ( const UtilStr& inStr ) {
1349 	Wipe();
1350 
1351 	Append( inStr.getCStr() );
1352 	return *this;
1353 }
1354 
1355 
1356 
1357 
1358 
1359 
1360 
Hash() const1361 long UtilStr::Hash() const {
1362 	long hash = 0;
1363 	const char* stop	= getCStr();
1364 	const char* curPos	= stop + mStrLen - 1;
1365 
1366 	if ( mStrLen < 16 ) {
1367 		// Sample all the characters
1368  	    while ( curPos >= stop ) {
1369  			hash = ( hash * 37 ) + *curPos;
1370  			curPos--;
1371  		} }
1372  	else {
1373  	    // only sample some characters
1374  	    int skip = mStrLen / 7;
1375  	    while ( curPos >= stop ) {
1376  			hash = ( hash * 39 ) + *curPos;
1377  			curPos -= skip;
1378  		}
1379  	}
1380 
1381  	return hash;
1382 }
1383 
1384 
1385 
Equals(const Hashable * inComp) const1386 bool UtilStr::Equals( const Hashable* inComp ) const {
1387 
1388 	return compareTo( (UtilStr*) inComp ) == 0;
1389 }
1390 
1391 
1392 
1393 
1394 
AppendHex(char inB1,char inB2)1395 void UtilStr::AppendHex( char inB1, char inB2 ) {
1396 	unsigned char c;
1397 
1398 	if ( inB1 >= '0' && inB1 <= '9' )
1399 		inB1 -= '0';
1400 	else
1401 		inB1 = 9 + inB1 & 0xF;
1402 	c = inB1 << 4;
1403 
1404 	if ( inB2 >= '0' && inB2 <= '9' )
1405 		c += (inB2 - '0');
1406 	else
1407 		c += 9 + inB2 & 0xF;
1408 
1409 
1410 	Append( (char) c );
1411 }
1412 
1413 
1414 
1415 
1416 /*
1417 
1418 
1419 
1420 long UtilStr::Hash( const char* inStr, long inStrLen ) {
1421 	long hash = 0;
1422 	const char* curPos = inStr + inStrLen - 1;
1423 
1424 	if ( inStrLen < 0 )
1425 		inStrLen = inStr's len
1426 
1427 	if ( inStrLen < 16 ) {
1428 		// Sample all the characters
1429  	    while ( curPos >= inStr ) {
1430  			hash = ( hash * 37 ) + *curPos;
1431  			curPos--;
1432  		} }
1433  	else {
1434  	    // only sample some characters
1435  	    int skip = inStrLen / 7;
1436  	    while ( curPos >= inStr ) {
1437  			hash = ( hash * 39 ) + *curPos;
1438  			curPos -= skip;
1439  		}
1440  	}
1441 
1442  	return hash;
1443 }(*/
1444 
1445 
1446 
1447 
1448