1 /* -*- mode: C++; tab-width: 4; c-basic-offset: 4; -*- */
2 
3 // UT_Stringbuf.cpp
4 
5 // Copyright (C) 2001 Mike Nordell <tamlin@algonet.se>
6 // Copyright (c) 2007 Hubert Figuiere <hub@figuiere.net>
7 //
8 // This class is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU General Public License
10 // as published by the Free Software Foundation; either version 2
11 // of the License, or (at your option) any later version.
12 //
13 // This class is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 // GNU General Public License for more details.
17 //
18 // You should have received a copy of the GNU General Public License
19 // along with this program; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 // 02110-1301 USA.
22 //
23 #include <stdlib.h>
24 #include <ctype.h>
25 #include <stdio.h>
26 #include <algorithm>
27 
28 #include <libxml/uri.h>
29 
30 #include <glib.h>
31 
32 #include "ut_string.h"
33 #include "ut_stringbuf.h"
34 #include "ut_unicode.h"
35 #include "ut_string_class.h"
36 #include "ut_assert.h"
37 #include "ut_debugmsg.h"
38 
39 // these classes keep zero terminated strings.
40 // if size() != 0, capacity() is always at least size() + 1.
41 
42 //////////////////////////////////////////////////////////////////
43 
44 
45 
46 
47 
48 ////////////////////////////////////////////////////////////////////////
49 //
50 //  UTF-8 string: encoding is *always* UTF-8
51 //
52 ////////////////////////////////////////////////////////////////////////
53 
54 
UT_UTF8Stringbuf()55 UT_UTF8Stringbuf::UT_UTF8Stringbuf () :
56 	m_psz(0),
57 	m_pEnd(0),
58 	m_strlen(0),
59 	m_buflen(0)
60 {
61 	//
62 }
63 
/* Copy constructor: start out empty, then append the whole of rhs.
 */
UT_UTF8Stringbuf::UT_UTF8Stringbuf (const UT_UTF8Stringbuf & rhs)
	: m_psz(NULL),
	  m_pEnd(NULL),
	  m_strlen(0),
	  m_buflen(0)
{
	this->append (rhs);
}
72 
UT_UTF8Stringbuf(const char * sz,size_t n)73 UT_UTF8Stringbuf::UT_UTF8Stringbuf (const char * sz, size_t n /* == 0 => null-termination */) :
74 	m_psz(0),
75 	m_pEnd(0),
76 	m_strlen(0),
77 	m_buflen(0)
78 {
79 	append (sz, n);
80 }
81 
~UT_UTF8Stringbuf()82 UT_UTF8Stringbuf::~UT_UTF8Stringbuf ()
83 {
84 	clear ();
85 }
86 
operator =(const UT_UTF8Stringbuf & rhs)87 void UT_UTF8Stringbuf::operator=(const UT_UTF8Stringbuf & rhs)
88 {
89 	m_pEnd = m_psz;
90 	m_strlen = 0;
91 	append (rhs);
92 }
93 
assign(const char * sz,size_t n)94 void UT_UTF8Stringbuf::assign (const char * sz, size_t n /* == 0 => null-termination */)
95 {
96 	m_pEnd = m_psz;
97 	m_strlen = 0;
98 	append (sz, n);
99 }
100 
101 // returns 0 if invalid, or if end of string, i.e. 0
102 // technically it could differentiate, since UCS-4 is only 31-bit, but...
charCode(const char * str)103 UT_UTF8Stringbuf::UCS4Char UT_UTF8Stringbuf::charCode (const char * str)
104 {
105 	if ( str == 0) return 0;
106 	if (*str == 0) return 0;
107 
108 	const char * p = str;
109 
110 	if ((*p & 0x80) == 0x00) // plain us-ascii part of latin-1
111 	{
112 		return (UCS4Char) (*p);
113 	}
114 
115 	UCS4Char ret_code = 0;
116 
117 	int bytesInSequence = 0;
118 	int bytesExpectedInSequence = 0;
119 
120 	while (*p)
121 	{
122 		// 'continuing' octets:
123 		if ((*p & 0xc0) == 0x80) // trailing byte in multi-byte sequence
124 		{
125 			if (bytesInSequence == 0) break;
126 			bytesInSequence++;
127 
128 			ret_code = (ret_code << 6) | (UCS4Char) (*p & 0x3f);
129 
130 			if (bytesInSequence == bytesExpectedInSequence) break;
131 
132 			p++;
133 			continue;
134 		}
135 
136 		if (bytesInSequence) break;
137 		bytesInSequence++;
138 
139 		/* 4,5,6-byte sequences may require > 2 bytes in UCS-4
140 		 */
141 		if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence
142 		{
143 			bytesExpectedInSequence = 6;
144 			ret_code = (UCS4Char) (*p & 0x01);
145 			p++;
146 			continue;
147 		}
148 		if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence
149 		{
150 			bytesExpectedInSequence = 5;
151 			ret_code = (UCS4Char) (*p & 0x03);
152 			p++;
153 			continue;
154 		}
155 		if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence
156 		{
157 			bytesExpectedInSequence = 4;
158 			ret_code = (UCS4Char) (*p & 0x07);
159 			p++;
160 			continue;
161 		}
162 
163 		/* 1,2,3-byte sequences do not require > 2 bytes in UCS-4
164 		 */
165 		if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence
166 		{
167 			bytesExpectedInSequence = 3;
168 			ret_code = (UCS4Char) (*p & 0x0f);
169 			p++;
170 			continue;
171 		}
172 		if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence
173 		{
174 			bytesExpectedInSequence = 2;
175 			ret_code = (UCS4Char) (*p & 0x1f);
176 			p++;
177 			continue;
178 		}
179 
180 		ret_code = 0;
181 		break; // invalid byte - not UTF-8
182 	}
183 	if (bytesInSequence != bytesExpectedInSequence) ret_code = 0;
184 
185 	return ret_code;
186 }
187 
append(const char * sz,size_t n)188 void UT_UTF8Stringbuf::append (const char * sz, size_t n /* == 0 => null-termination */)
189 {
190 	if (sz == 0)
191 		return;
192 	if (!grow ((n?n:strlen(sz)) + 1))
193 		return;
194 
195 	const char * p = sz;
196 	char buf[6];
197 	int bytesInSequence = 0;
198 	int bytesExpectedInSequence = 0;
199 	size_t np = 0;
200 
201 	while ((!n && *p) || (np < n))
202 	{
203 		if ((*p & 0x80) == 0x00) // plain us-ascii part of latin-1
204 		{
205 			if (bytesInSequence) break;
206 
207 			*m_pEnd++ = *p;
208 			*m_pEnd = 0;
209 			m_strlen++;
210 
211 			p++;
212 			np++;
213 			continue;
214 		}
215 
216 		// 'continuing' octets:
217 		if ((*p & 0xc0) == 0x80) // trailing byte in multi-byte sequence
218 		{
219 			if (bytesInSequence == 0) break;
220 
221 			buf[bytesInSequence++] = *p;
222 			if (bytesInSequence == bytesExpectedInSequence)
223 			{
224 				for (int b = 0; b < bytesInSequence; b++) *m_pEnd++ = buf[b];
225 				*m_pEnd = 0;
226 				m_strlen++;
227 				bytesInSequence = 0;
228 				bytesExpectedInSequence = 0;
229 			}
230 
231 			p++;
232 			np++;
233 			continue;
234 		}
235 
236 		if (bytesInSequence) break;
237 
238 		buf[bytesInSequence++] = *p;
239 
240 		/* 4,5,6-byte sequences may require > 2 bytes in UCS-4
241 		 */
242 		if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence
243 		{
244 			bytesExpectedInSequence = 6;
245 			p++;
246 			np++;
247 			continue;
248 		}
249 		if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence
250 		{
251 			bytesExpectedInSequence = 5;
252 			p++;
253 			np++;
254 			continue;
255 		}
256 		if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence
257 		{
258 			bytesExpectedInSequence = 4;
259 			p++;
260 			np++;
261 			continue;
262 		}
263 
264 		/* 1,2,3-byte sequences do not require > 2 bytes in UCS-4
265 		 */
266 		if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence
267 		{
268 			bytesExpectedInSequence = 3;
269 			p++;
270 			np++;
271 			continue;
272 		}
273 		if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence
274 		{
275 			bytesExpectedInSequence = 2;
276 			p++;
277 			np++;
278 			continue;
279 		}
280 
281 		break; // invalid byte - not UTF-8
282 	}
283 }
284 
append(const UT_UTF8Stringbuf & rhs)285 void UT_UTF8Stringbuf::append (const UT_UTF8Stringbuf & rhs)
286 {
287 	if (grow (rhs.byteLength () + 1))
288 	{
289 		memcpy (m_pEnd, rhs.data (), rhs.byteLength ());
290 		m_strlen += rhs.utf8Length ();
291 		m_pEnd = m_pEnd + rhs.byteLength ();
292 		*m_pEnd = 0;
293 	}
294 }
295 
appendUCS4(const UT_UCS4Char * sz,size_t n)296 void UT_UTF8Stringbuf::appendUCS4 (const UT_UCS4Char * sz, size_t n /* == 0 => null-termination */)
297 {
298 	size_t bytelength = 0;
299 	size_t i;
300 
301 	if (!sz || (!n && !*sz))
302 		return;
303 
304 	/* The vast majority of calls to appendUCS4 pass in
305 	   1 for n, so we can halve the number of calls to g_unichar_to_utf8
306 	   (in most cases) by caching the first byte length. */
307 	int iCache = 0;
308 
309 	for (i = 0; (i < n) || (n == 0); i++)
310 	{
311 		if((0 == sz[i]) && (0 == n))
312 			break;
313 		int seql = UT_Unicode::UTF8_ByteLength (sz[i]);
314 		if(i == 0)
315 			iCache = seql;
316 
317 		if (seql < 0)
318 			continue; // not UCS-4 !!
319 		if (seql == 0)
320 			break; // end-of-string?
321 		bytelength += static_cast<size_t>(seql);
322 	}
323 	if(bytelength == 0)
324 		return;
325 	if (!grow (bytelength + 1)) return;
326 
327 	for (i = 0; (i < n) || (n == 0); i++)
328 	{
329 		if((0 == sz[i]) && (0 == n))
330 			break;
331 		int seql;
332 		if(i == 0)
333 			seql = iCache;
334 		else
335 			seql = UT_Unicode::UTF8_ByteLength (sz[i]);
336 
337 		if (seql < 0)
338 			continue; // not UCS-4 !!
339 		if (seql == 0)
340 			break; // end-of-string?
341 		UT_Unicode::UCS4_to_UTF8 (m_pEnd, bytelength, sz[i]);
342 		m_strlen++;
343 	}
344 	*m_pEnd = 0;
345 }
346 
appendUCS2(const UT_UCS2Char * sz,size_t n)347 void UT_UTF8Stringbuf::appendUCS2 (const UT_UCS2Char * sz, size_t n /* == 0 => null-termination */)
348 {
349 	size_t bytelength = 0;
350 	size_t i;
351 	for (i = 0; (i < n) || (n == 0); i++)
352 	{
353 		if (sz[i]==0 && n==0) break;
354 		int seql = UT_Unicode::UTF8_ByteLength ((UT_UCS4Char)sz[i]);
355 		if (seql < 0)
356 			continue; // not UCS-4 !!
357 		if (seql == 0)
358 			break; // end-of-string?
359 		bytelength += static_cast<size_t>(seql);
360 	}
361 
362 	if (!grow (bytelength + 1)) return;
363 
364 	for (i = 0; (i < n) || (n == 0); i++)
365 	{
366 		if (sz[i]==0 && n==0) break;
367 		int seql = UT_Unicode::UTF8_ByteLength ((UT_UCS4Char)sz[i]);
368 		if (seql < 0)
369 			continue; // not UCS-4 !!
370 		if (seql == 0)
371 			break; // end-of-string?
372 		UT_Unicode::UCS4_to_UTF8 (m_pEnd, bytelength, (UT_UCS4Char)sz[i]);
373 		m_strlen++;
374 	}
375 	*m_pEnd = 0;
376 }
377 
378 /* replaces <str1> with <str2> in the current string
379  */
escape(const UT_UTF8String & utf8_str1,const UT_UTF8String & utf8_str2)380 void UT_UTF8Stringbuf::escape (const UT_UTF8String & utf8_str1,
381 							   const UT_UTF8String & utf8_str2)
382 {
383 	size_t diff = 0;
384 	size_t len1 = utf8_str1.byteLength ();
385 	size_t len2 = utf8_str2.byteLength ();
386 
387 	const char * str1 = utf8_str1.utf8_str ();
388 	const char * str2 = utf8_str2.utf8_str ();
389 
390 	if (len2 > len1)
391 	{
392 		diff = len2 - len1;
393 
394 		size_t incr = 0;
395 
396 		char * ptr = m_psz;
397 		while (ptr + len1 <= m_pEnd)
398 		{
399 			if (memcmp (ptr, str1, len1) == 0)
400 			{
401 				incr += diff;
402 				ptr += len1;
403 			}
404 			else
405 			{
406 				++ptr;
407 			}
408 		}
409 		if (!grow (incr)) return;
410 	}
411 	else
412 	{
413 		diff = len1 - len2;
414 	}
415 
416 	char * ptr = m_psz;
417 	while (ptr + len1 <= m_pEnd)
418 	{
419 		if (memcmp (ptr, str1, len1) == 0)
420 		{
421 			if (diff)
422 			{
423 				if (len2 > len1)
424 				{
425 					memmove (ptr + diff, ptr, m_pEnd - ptr + 1);
426 					m_pEnd += diff;
427 				}
428 				else
429 				{
430 					memmove (ptr, ptr + diff, m_pEnd - (ptr + diff) + 1);
431 					m_pEnd -= diff;
432 				}
433 			}
434 			memcpy (ptr, str2, len2);
435 			ptr += len2;
436 			m_strlen += utf8_str2.length () - utf8_str1.length ();
437 		}
438 		else
439 		{
440 			++ptr;
441 		}
442 	}
443 }
444 
445 /* FIXME -- these functions assume that &, <, > and " cannot appear in
446  *          multi-byte utf8 sequence -- I do not think that holds
447  *
448  *          Also, the decode function should handle other & tokens
449  *
450  *          Should use glib to traverse these strings
451  */
decodeXML()452 void UT_UTF8Stringbuf::decodeXML ()
453 {
454 	if (!m_psz)
455 		return;
456 
457 	size_t shrink = 0;
458 	char * p_src = m_psz;
459 	char * p_dst = m_psz;
460 
461 	while (p_src < m_pEnd && *p_src)
462 	{
463 		if(*p_src == '&')
464 		{
465 			if (!strncmp (p_src+1, "amp;", 4))
466 			{
467 				*p_dst++ = '&';
468 				p_src += 5;
469 				shrink += 4;
470 				continue;
471 			}
472 			else if (!strncmp (p_src+1, "lt;", 3))
473 			{
474 				*p_dst++ = '<';
475 				p_src += 4;
476 				shrink += 3;
477 				continue;
478 			}
479 			else if (!strncmp (p_src+1, "gt;", 3))
480 			{
481 				*p_dst++ = '>';
482 				p_src += 4;
483 				shrink += 3;
484 				continue;
485 			}
486 			else if (!strncmp (p_src+1, "quot;", 5))
487 			{
488 				*p_dst++ = '"';
489 				p_src += 6;
490 				shrink += 5;
491 				continue;
492 			}
493 		}
494 
495 		*p_dst = *p_src;
496 
497 		p_dst++;
498 		p_src++;
499 	}
500 
501 	*p_dst = 0;
502 	m_pEnd -= shrink;
503 }
504 
505 /* escapes '<', '>', '\"' and '&' in the current string
506  */
escapeXML()507 void UT_UTF8Stringbuf::escapeXML ()
508 {
509 	size_t incr = 0;
510 
511 	char * ptr = m_psz;
512 	while (ptr < m_pEnd)
513 		{
514 			if ((*ptr == '<') || (*ptr == '>')) incr += 3;
515 			else if (*ptr == '&') incr += 4;
516 			else if (*ptr == '"') incr += 5;
517 			ptr++;
518 		}
519 	bool bInsert = grow (incr);
520 
521 	ptr = m_psz;
522 	while (ptr < m_pEnd)
523 		{
524 			if (*ptr == '<')
525 				{
526 					if (bInsert)
527 						{
528 							*ptr++ = '&';
529 							insert (ptr, "lt;", 3);
530 						}
531 					else *ptr++ = '?';
532 				}
533 			else if (*ptr == '>')
534 				{
535 					if (bInsert)
536 						{
537 							*ptr++ = '&';
538 							insert (ptr, "gt;", 3);
539 						}
540 					else *ptr++ = '?';
541 				}
542 			else if (*ptr == '&')
543 				{
544 					if (bInsert)
545 						{
546 							*ptr++ = '&';
547 							insert (ptr, "amp;", 4);
548 						}
549 					else *ptr++ = '?';
550 				}
551 			else if (*ptr == '"')
552 				{
553 					if (bInsert)
554 						{
555 							*ptr++ = '&';
556 							insert (ptr, "quot;", 5);
557 						}
558 					else *ptr++ = '?';
559 				}
560 			else ptr++;
561 		}
562 }
563 
564 /*
565    this function escapes the string to provide for conformity with
566    http://www.w3.org/TR/xlink/#link-locators, section 5.4
567 
568    Just use libxml and hope for the best.
569 */
escapeURL()570 void UT_UTF8Stringbuf::escapeURL ()
571 {
572 	if(!m_psz || !*m_psz)
573 		return;
574 
575 	xmlChar * uri = xmlURIEscape(BAD_CAST m_psz);
576 	if(uri) {
577 		assign((gchar*)uri);
578 		xmlFree(uri);
579 	}
580 }
581 
582 /* decode %xx encoded characters
583  */
584 
s_charCode_to_hexval(UT_UCS4Char c)585 static UT_uint32 s_charCode_to_hexval(UT_UCS4Char c)
586 {
587 	if(c >= 0x30 && c <= 0x39)
588 		return c - 0x30;
589 	else if(c >= 0x41 && c <= 0x46)
590 		return c - 0x41 + 10;
591 	else if(c >= 0x61 && c <= 0x66)
592 		return c - 0x61 + 10;
593 
594 	UT_return_val_if_fail( UT_SHOULD_NOT_HAPPEN, 0 );
595 }
596 
decodeURL()597 void UT_UTF8Stringbuf::decodeURL()
598 {
599 	if(!m_psz || !*m_psz)
600 		return;
601 
602 	char * buff = (char*)g_try_malloc(byteLength() + 1);
603 	UT_return_if_fail( buff );
604 	buff[0] = 0;
605 
606 	UTF8Iterator J(this);
607 	const char * ptr = J.current();
608 	UT_UCS4Char c = charCode(J.current());
609 
610 	char utf8cache[7]; utf8cache[6] = 0;
611 	UT_uint32 iCachePos = 0;
612 	UT_uint32 iCacheNeeded = 0;
613 
614 
615 	while (c != 0)
616 	{
617 		if(c == '%')
618 		{
619 			J.advance();
620 			UT_UCS4Char b1 = charCode(J.current());
621 			J.advance();
622 			UT_UCS4Char b2 = charCode(J.current());
623 			J.advance();
624 
625 			if(isalnum(b1) && isalnum(b2))
626 			{
627 				b1 = s_charCode_to_hexval(b1);
628 				b2 = s_charCode_to_hexval(b2);
629 
630 				UT_UCS4Char code = ((b1 << 4)& 0xf0) | (b2 & 0x0f);
631 
632 				if(iCacheNeeded == 0)
633 				{
634 					// we start new utf8 sequence in the cache
635 					if ((code & 0x80) == 0)         iCacheNeeded = 1;
636 					else if ((code & 0xe0) == 0xc0) iCacheNeeded = 2;
637 					else if ((code & 0xf0) == 0xe0) iCacheNeeded = 3;
638 					else if ((code & 0xf8) == 0xf0) iCacheNeeded = 4;
639 					else if ((code & 0xfc) == 0xf8) iCacheNeeded = 5;
640 					else if ((code & 0xfe) == 0xfc) iCacheNeeded = 6;
641 
642 					utf8cache[0] = (char) code;
643 					utf8cache[iCacheNeeded] = 0; // make sure the sequence will be terminated
644 					iCachePos++;
645 				}
646 				else
647 				{
648 					// append to our cache
649 					utf8cache[iCachePos++] = (char) code;
650 				}
651 
652 				if(iCacheNeeded == 0 && (code >= 0x7f && code <= 0xff))
653 				{
654 					// the present character is not a valid start of utf8 sequence --
655 					// this is almost certainly a character from the extended ASCII set
656 					// which was encoded directly according to the RFC 1738 scheme, we
657 					// just append it
658 
659 					size_t iLenBuff = strlen(buff);
660 					size_t iLenLeft = byteLength() - iLenBuff;
661 
662 					char * p = buff + iLenBuff;
663 					UT_Unicode::UCS4_to_UTF8(p, iLenLeft, code);
664 
665 					// we need to null-terminate
666 					*p = 0;
667 				}
668 
669 				if(iCacheNeeded && iCacheNeeded <= iCachePos)
670 				{
671 					UT_ASSERT_HARMLESS( iCacheNeeded == iCachePos );
672 
673 					// append the cache to our buffer
674 					UT_uint32 iLenBuff = strlen(buff);
675 					char * p = buff + iLenBuff;
676 					strcat(p, utf8cache);
677 
678 					iCacheNeeded = iCachePos = 0;
679 				}
680 			}
681 			else
682 			{
683 				// this should not happen in encoded url and so we will ignore this token;
684 				// if we are in the middle of utf8 sequence; we will reset it
685 				iCacheNeeded = iCachePos = 0;
686 			}
687 		}
688 		else
689 		{
690 			J.advance(); // advance here, for the sake of the else clause below
691 
692 			if(iCacheNeeded > iCachePos)
693 			{
694 				// we are processing a utf sequence, so just append this byte to our cache
695 				utf8cache[iCachePos++] = (char) c;
696 			}
697 			else
698 			{
699 				const char * p = J.current();
700 				UT_uint32 iLen = p ? p - ptr : strlen(ptr);
701 				strncat(buff, ptr, iLen);
702 			}
703 		}
704 
705 		ptr = J.current();
706 		c = charCode(J.current());
707 	}
708 
709 	assign(buff);
710 	g_free(buff);
711 }
712 
713 /* translates the current string to MIME "quoted-printable" format
714  */
escapeMIME()715 void UT_UTF8Stringbuf::escapeMIME ()
716 {
717 	static const char hex[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' };
718 	static const char * s_eol = "=\r\n";
719 
720 	if (m_strlen == 0) return;
721 
722 	size_t bytes = 0;
723 	char * ptr = m_psz;
724 	while (*ptr)
725 		{
726 			char c = *ptr++;
727 			unsigned char u = static_cast<unsigned char>(c);
728 
729 			if ((c == '\r') || (c == '\n') || (c == '=') || (u & 0x80)) bytes += 2;
730 		}
731 	if (bytes)
732 		{
733 			if (!grow (bytes)) return;
734 
735 			char * pOld = m_pEnd;
736 			char * pNew = m_pEnd + bytes;
737 
738 			while (pOld >= m_psz)
739 				{
740 					char c = *pOld--;
741 					unsigned char u = static_cast<unsigned char>(c);
742 
743 					if ((u & 0x80) || (c == '\r') || (c == '\n') || (c == '='))
744 						{
745 							*pNew-- = hex[ u       & 0x0f];
746 							*pNew-- = hex[(u >> 4) & 0x0f];
747 							*pNew-- = '=';
748 						}
749 					else *pNew-- = c;
750 				}
751 			m_pEnd += bytes;
752 			m_strlen = m_pEnd - m_psz;
753 		}
754 
755 	size_t length = 0;
756 	ptr = m_psz;
757 	while (true)
758 		{
759 			if (*ptr == 0)
760 				{
761 					if (length)
762 						{
763 							size_t offset = ptr - m_psz;
764 							if (grow (3))
765 								{
766 									ptr = m_psz + offset;
767 									insert (ptr, s_eol, 3);
768 								}
769 						}
770 					break;
771 				}
772 			if (length >= 70)
773 				{
774 					size_t offset = ptr - m_psz;
775 					if (grow (3))
776 						{
777 							ptr = m_psz + offset;
778 							insert (ptr, s_eol, 3);
779 						}
780 					length = 0;
781 				}
782 
783 			if (*ptr == '=')
784 				{
785 					ptr += 3;
786 					length += 3;
787 				}
788 			else
789 				{
790 					ptr++;
791 					length++;
792 				}
793 		}
794 }
795 
lowerCase()796 UT_UTF8Stringbuf * UT_UTF8Stringbuf::lowerCase ()
797 {
798 	if(!byteLength())
799 		return NULL;
800 
801 	UT_UTF8Stringbuf * n = new UT_UTF8Stringbuf();
802 	UT_return_val_if_fail(n, NULL);
803 
804 	UTF8Iterator s(this);
805 	UT_UCS4Char c = charCode(s.current());
806 
807 	while(c)
808 	{
809 		UT_UCS4Char l = UT_UCS4_tolower(c);
810 		n->appendUCS4(&l,1);
811 		c = charCode(s.advance());
812 	}
813 
814 	return n;
815 }
816 
clear()817 void UT_UTF8Stringbuf::clear ()
818 {
819 	if (m_psz) g_free (m_psz);
820 	m_psz = 0;
821 	m_pEnd = 0;
822 	m_strlen = 0;
823 	m_buflen = 0;
824 }
825 
insert(char * & ptr,const char * str,size_t utf8length)826 void UT_UTF8Stringbuf::insert (char *& ptr, const char * str, size_t utf8length)
827 {
828 	if ( str == 0) return;
829 	if (*str == 0) return;
830 
831 	if ((ptr < m_psz) || (ptr > m_pEnd)) return;
832 
833 	char * orig_buf = m_psz;
834 	char * orig_ptr = ptr;
835 
836 	size_t length = static_cast<size_t>(strlen(str));
837 
838 	if (!grow (length)) return;
839 
840 	ptr = m_psz + (orig_ptr - orig_buf);
841 
842 	memmove (ptr + length, ptr, (m_pEnd - ptr) + 1);
843 	memcpy (ptr, str, length);
844 
845 	ptr += length;
846 	m_pEnd += length;
847 	m_strlen += utf8length;
848 }
849 
reserve(size_t n)850 void UT_UTF8Stringbuf::reserve(size_t n)
851 {
852 	grow(n);
853 }
854 
grow(size_t length)855 bool UT_UTF8Stringbuf::grow (size_t length)
856 {
857 	if (length + 1 <= (m_buflen - (m_pEnd - m_psz))) return true;
858 
859 	if (m_psz == 0)
860 	{
861 		if (length == 0) return true;
862 		m_psz = static_cast<char *>(g_try_malloc(length));
863 		if (m_psz == 0) return false;
864 		m_strlen = 0;
865 		m_buflen = length;
866 		m_pEnd = m_psz;
867 		*m_pEnd = 0;
868 		return true;
869 	}
870 
871 	size_t new_length = length + (m_pEnd - m_psz) + 1;
872 	size_t end_offset = m_pEnd - m_psz;
873 
874 	char * more = static_cast<char *>(g_try_realloc(static_cast<void *>(m_psz), new_length));
875 	if (more == 0) return false;
876 	m_psz = more;
877 	m_pEnd = m_psz + end_offset;
878 	m_buflen = new_length;
879 	return true;
880 }
881 
/* Creates an iterator over strbuf. sync() is called right away; it is
 * what attaches the iterator to the buffer's current storage.
 */
UT_UTF8Stringbuf::UTF8Iterator::UTF8Iterator (const UT_UTF8Stringbuf * strbuf)
	: m_strbuf(strbuf),
	  m_utfbuf(NULL),
	  m_utfptr(NULL)
{
	(void) sync ();
}
889 
~UTF8Iterator()890 UT_UTF8Stringbuf::UTF8Iterator::~UTF8Iterator ()
891 {
892 	//
893 }
894 
operator =(const char * position)895 void UT_UTF8Stringbuf::UTF8Iterator::operator=(const char * position)
896 {
897 	if (!sync ()) return;
898 	if (static_cast<unsigned>(position- m_utfbuf) > m_strbuf->byteLength ())
899 	{
900 		m_utfptr = m_utfbuf + m_strbuf->byteLength ();
901 	}
902 	else
903 	{
904 		m_utfptr = position;
905 	}
906 }
907 
current()908 const char * UT_UTF8Stringbuf::UTF8Iterator::current ()
909 {
910 	if (!sync ()) return 0;
911 	if ((*m_utfptr & 0xc0) == 0x80) return 0; // oops - a 'continuing' byte
912 	return m_utfptr;
913 }
914 
start()915 const char * UT_UTF8Stringbuf::UTF8Iterator::start ()
916 {
917 	if (!sync ())
918 		return 0;
919 	return m_utfbuf;
920 }
921 
end()922 const char * UT_UTF8Stringbuf::UTF8Iterator::end ()
923 {
924 	if (!sync ())
925 		return 0;
926 	return m_utfbuf + m_strbuf->byteLength ();
927 }
928 
advance()929 const char * UT_UTF8Stringbuf::UTF8Iterator::advance ()
930 {
931 	if (!sync ())
932 		return 0;
933 	if (*m_utfptr == 0)
934 		return 0;
935 	do {
936 		m_utfptr++;
937 	} while ((*m_utfptr & 0xc0) == 0x80); // a 'continuing' byte
938 	return m_utfptr;
939 }
940 
retreat()941 const char * UT_UTF8Stringbuf::UTF8Iterator::retreat ()
942 {
943 	if (!sync ())
944 		return 0;
945 	if (m_utfptr == m_utfbuf)
946 		return 0;
947 	do {
948 		m_utfptr--;
949 	} while ((*m_utfptr & 0xc0) == 0x80); // a 'continuing' byte
950 	return m_utfptr;
951 }
952 
953 // returns false only if there is no string data
sync()954 bool UT_UTF8Stringbuf::UTF8Iterator::sync ()
955 {
956 	if (m_strbuf == 0)
957 		return false;
958 
959 	const char * utf8_buffer = m_strbuf->data ();
960 	if (utf8_buffer == 0)
961 	{
962 		m_utfbuf = 0;
963 		m_utfptr = 0;
964 		return false;
965 	}
966 
967 	size_t utf8_length = m_strbuf->byteLength ();
968 
969 	/* note that this doesn't guarantee that m_utfptr points to the
970 	 * start of UTF-8 char sequence
971 	 */
972 	if (static_cast<unsigned>(m_utfptr- m_utfbuf) > utf8_length)
973 	{
974 		m_utfptr = utf8_buffer + utf8_length;
975 	}
976 	else
977 	{
978 		m_utfptr = utf8_buffer + (m_utfptr - m_utfbuf);
979 	}
980 	m_utfbuf = utf8_buffer;
981 
982 	return true;
983 }
984 
985 
986 
987