1 /* -*- mode: C++; tab-width: 4; c-basic-offset: 4; -*- */
2
3 // UT_Stringbuf.cpp
4
5 // Copyright (C) 2001 Mike Nordell <tamlin@algonet.se>
6 // Copyright (c) 2007 Hubert Figuiere <hub@figuiere.net>
7 //
8 // This class is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU General Public License
10 // as published by the Free Software Foundation; either version 2
11 // of the License, or (at your option) any later version.
12 //
13 // This class is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // GNU General Public License for more details.
17 //
18 // You should have received a copy of the GNU General Public License
19 // along with this program; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 // 02110-1301 USA.
22 //
23 #include <stdlib.h>
24 #include <ctype.h>
25 #include <stdio.h>
26 #include <algorithm>
27
28 #include <libxml/uri.h>
29
30 #include <glib.h>
31
32 #include "ut_string.h"
33 #include "ut_stringbuf.h"
34 #include "ut_unicode.h"
35 #include "ut_string_class.h"
36 #include "ut_assert.h"
37 #include "ut_debugmsg.h"
38
39 // these classes keep zero terminated strings.
40 // if size() != 0, capacity() is always at least size() + 1.
41
42 //////////////////////////////////////////////////////////////////
43
44
45
46
47
48 ////////////////////////////////////////////////////////////////////////
49 //
50 // UTF-8 string: encoding is *always* UTF-8
51 //
52 ////////////////////////////////////////////////////////////////////////
53
54
UT_UTF8Stringbuf()55 UT_UTF8Stringbuf::UT_UTF8Stringbuf () :
56 m_psz(0),
57 m_pEnd(0),
58 m_strlen(0),
59 m_buflen(0)
60 {
61 //
62 }
63
/* Copy constructor: starts from an empty buffer and appends the
 * contents of rhs.
 */
UT_UTF8Stringbuf::UT_UTF8Stringbuf (const UT_UTF8Stringbuf & rhs)
	: m_psz(0), m_pEnd(0), m_strlen(0), m_buflen(0)
{
	append (rhs);
}
72
UT_UTF8Stringbuf(const char * sz,size_t n)73 UT_UTF8Stringbuf::UT_UTF8Stringbuf (const char * sz, size_t n /* == 0 => null-termination */) :
74 m_psz(0),
75 m_pEnd(0),
76 m_strlen(0),
77 m_buflen(0)
78 {
79 append (sz, n);
80 }
81
~UT_UTF8Stringbuf()82 UT_UTF8Stringbuf::~UT_UTF8Stringbuf ()
83 {
84 clear ();
85 }
86
operator =(const UT_UTF8Stringbuf & rhs)87 void UT_UTF8Stringbuf::operator=(const UT_UTF8Stringbuf & rhs)
88 {
89 m_pEnd = m_psz;
90 m_strlen = 0;
91 append (rhs);
92 }
93
assign(const char * sz,size_t n)94 void UT_UTF8Stringbuf::assign (const char * sz, size_t n /* == 0 => null-termination */)
95 {
96 m_pEnd = m_psz;
97 m_strlen = 0;
98 append (sz, n);
99 }
100
101 // returns 0 if invalid, or if end of string, i.e. 0
102 // technically it could differentiate, since UCS-4 is only 31-bit, but...
charCode(const char * str)103 UT_UTF8Stringbuf::UCS4Char UT_UTF8Stringbuf::charCode (const char * str)
104 {
105 if ( str == 0) return 0;
106 if (*str == 0) return 0;
107
108 const char * p = str;
109
110 if ((*p & 0x80) == 0x00) // plain us-ascii part of latin-1
111 {
112 return (UCS4Char) (*p);
113 }
114
115 UCS4Char ret_code = 0;
116
117 int bytesInSequence = 0;
118 int bytesExpectedInSequence = 0;
119
120 while (*p)
121 {
122 // 'continuing' octets:
123 if ((*p & 0xc0) == 0x80) // trailing byte in multi-byte sequence
124 {
125 if (bytesInSequence == 0) break;
126 bytesInSequence++;
127
128 ret_code = (ret_code << 6) | (UCS4Char) (*p & 0x3f);
129
130 if (bytesInSequence == bytesExpectedInSequence) break;
131
132 p++;
133 continue;
134 }
135
136 if (bytesInSequence) break;
137 bytesInSequence++;
138
139 /* 4,5,6-byte sequences may require > 2 bytes in UCS-4
140 */
141 if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence
142 {
143 bytesExpectedInSequence = 6;
144 ret_code = (UCS4Char) (*p & 0x01);
145 p++;
146 continue;
147 }
148 if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence
149 {
150 bytesExpectedInSequence = 5;
151 ret_code = (UCS4Char) (*p & 0x03);
152 p++;
153 continue;
154 }
155 if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence
156 {
157 bytesExpectedInSequence = 4;
158 ret_code = (UCS4Char) (*p & 0x07);
159 p++;
160 continue;
161 }
162
163 /* 1,2,3-byte sequences do not require > 2 bytes in UCS-4
164 */
165 if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence
166 {
167 bytesExpectedInSequence = 3;
168 ret_code = (UCS4Char) (*p & 0x0f);
169 p++;
170 continue;
171 }
172 if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence
173 {
174 bytesExpectedInSequence = 2;
175 ret_code = (UCS4Char) (*p & 0x1f);
176 p++;
177 continue;
178 }
179
180 ret_code = 0;
181 break; // invalid byte - not UTF-8
182 }
183 if (bytesInSequence != bytesExpectedInSequence) ret_code = 0;
184
185 return ret_code;
186 }
187
append(const char * sz,size_t n)188 void UT_UTF8Stringbuf::append (const char * sz, size_t n /* == 0 => null-termination */)
189 {
190 if (sz == 0)
191 return;
192 if (!grow ((n?n:strlen(sz)) + 1))
193 return;
194
195 const char * p = sz;
196 char buf[6];
197 int bytesInSequence = 0;
198 int bytesExpectedInSequence = 0;
199 size_t np = 0;
200
201 while ((!n && *p) || (np < n))
202 {
203 if ((*p & 0x80) == 0x00) // plain us-ascii part of latin-1
204 {
205 if (bytesInSequence) break;
206
207 *m_pEnd++ = *p;
208 *m_pEnd = 0;
209 m_strlen++;
210
211 p++;
212 np++;
213 continue;
214 }
215
216 // 'continuing' octets:
217 if ((*p & 0xc0) == 0x80) // trailing byte in multi-byte sequence
218 {
219 if (bytesInSequence == 0) break;
220
221 buf[bytesInSequence++] = *p;
222 if (bytesInSequence == bytesExpectedInSequence)
223 {
224 for (int b = 0; b < bytesInSequence; b++) *m_pEnd++ = buf[b];
225 *m_pEnd = 0;
226 m_strlen++;
227 bytesInSequence = 0;
228 bytesExpectedInSequence = 0;
229 }
230
231 p++;
232 np++;
233 continue;
234 }
235
236 if (bytesInSequence) break;
237
238 buf[bytesInSequence++] = *p;
239
240 /* 4,5,6-byte sequences may require > 2 bytes in UCS-4
241 */
242 if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence
243 {
244 bytesExpectedInSequence = 6;
245 p++;
246 np++;
247 continue;
248 }
249 if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence
250 {
251 bytesExpectedInSequence = 5;
252 p++;
253 np++;
254 continue;
255 }
256 if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence
257 {
258 bytesExpectedInSequence = 4;
259 p++;
260 np++;
261 continue;
262 }
263
264 /* 1,2,3-byte sequences do not require > 2 bytes in UCS-4
265 */
266 if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence
267 {
268 bytesExpectedInSequence = 3;
269 p++;
270 np++;
271 continue;
272 }
273 if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence
274 {
275 bytesExpectedInSequence = 2;
276 p++;
277 np++;
278 continue;
279 }
280
281 break; // invalid byte - not UTF-8
282 }
283 }
284
append(const UT_UTF8Stringbuf & rhs)285 void UT_UTF8Stringbuf::append (const UT_UTF8Stringbuf & rhs)
286 {
287 if (grow (rhs.byteLength () + 1))
288 {
289 memcpy (m_pEnd, rhs.data (), rhs.byteLength ());
290 m_strlen += rhs.utf8Length ();
291 m_pEnd = m_pEnd + rhs.byteLength ();
292 *m_pEnd = 0;
293 }
294 }
295
appendUCS4(const UT_UCS4Char * sz,size_t n)296 void UT_UTF8Stringbuf::appendUCS4 (const UT_UCS4Char * sz, size_t n /* == 0 => null-termination */)
297 {
298 size_t bytelength = 0;
299 size_t i;
300
301 if (!sz || (!n && !*sz))
302 return;
303
304 /* The vast majority of calls to appendUCS4 pass in
305 1 for n, so we can halve the number of calls to g_unichar_to_utf8
306 (in most cases) by caching the first byte length. */
307 int iCache = 0;
308
309 for (i = 0; (i < n) || (n == 0); i++)
310 {
311 if((0 == sz[i]) && (0 == n))
312 break;
313 int seql = UT_Unicode::UTF8_ByteLength (sz[i]);
314 if(i == 0)
315 iCache = seql;
316
317 if (seql < 0)
318 continue; // not UCS-4 !!
319 if (seql == 0)
320 break; // end-of-string?
321 bytelength += static_cast<size_t>(seql);
322 }
323 if(bytelength == 0)
324 return;
325 if (!grow (bytelength + 1)) return;
326
327 for (i = 0; (i < n) || (n == 0); i++)
328 {
329 if((0 == sz[i]) && (0 == n))
330 break;
331 int seql;
332 if(i == 0)
333 seql = iCache;
334 else
335 seql = UT_Unicode::UTF8_ByteLength (sz[i]);
336
337 if (seql < 0)
338 continue; // not UCS-4 !!
339 if (seql == 0)
340 break; // end-of-string?
341 UT_Unicode::UCS4_to_UTF8 (m_pEnd, bytelength, sz[i]);
342 m_strlen++;
343 }
344 *m_pEnd = 0;
345 }
346
appendUCS2(const UT_UCS2Char * sz,size_t n)347 void UT_UTF8Stringbuf::appendUCS2 (const UT_UCS2Char * sz, size_t n /* == 0 => null-termination */)
348 {
349 size_t bytelength = 0;
350 size_t i;
351 for (i = 0; (i < n) || (n == 0); i++)
352 {
353 if (sz[i]==0 && n==0) break;
354 int seql = UT_Unicode::UTF8_ByteLength ((UT_UCS4Char)sz[i]);
355 if (seql < 0)
356 continue; // not UCS-4 !!
357 if (seql == 0)
358 break; // end-of-string?
359 bytelength += static_cast<size_t>(seql);
360 }
361
362 if (!grow (bytelength + 1)) return;
363
364 for (i = 0; (i < n) || (n == 0); i++)
365 {
366 if (sz[i]==0 && n==0) break;
367 int seql = UT_Unicode::UTF8_ByteLength ((UT_UCS4Char)sz[i]);
368 if (seql < 0)
369 continue; // not UCS-4 !!
370 if (seql == 0)
371 break; // end-of-string?
372 UT_Unicode::UCS4_to_UTF8 (m_pEnd, bytelength, (UT_UCS4Char)sz[i]);
373 m_strlen++;
374 }
375 *m_pEnd = 0;
376 }
377
378 /* replaces <str1> with <str2> in the current string
379 */
escape(const UT_UTF8String & utf8_str1,const UT_UTF8String & utf8_str2)380 void UT_UTF8Stringbuf::escape (const UT_UTF8String & utf8_str1,
381 const UT_UTF8String & utf8_str2)
382 {
383 size_t diff = 0;
384 size_t len1 = utf8_str1.byteLength ();
385 size_t len2 = utf8_str2.byteLength ();
386
387 const char * str1 = utf8_str1.utf8_str ();
388 const char * str2 = utf8_str2.utf8_str ();
389
390 if (len2 > len1)
391 {
392 diff = len2 - len1;
393
394 size_t incr = 0;
395
396 char * ptr = m_psz;
397 while (ptr + len1 <= m_pEnd)
398 {
399 if (memcmp (ptr, str1, len1) == 0)
400 {
401 incr += diff;
402 ptr += len1;
403 }
404 else
405 {
406 ++ptr;
407 }
408 }
409 if (!grow (incr)) return;
410 }
411 else
412 {
413 diff = len1 - len2;
414 }
415
416 char * ptr = m_psz;
417 while (ptr + len1 <= m_pEnd)
418 {
419 if (memcmp (ptr, str1, len1) == 0)
420 {
421 if (diff)
422 {
423 if (len2 > len1)
424 {
425 memmove (ptr + diff, ptr, m_pEnd - ptr + 1);
426 m_pEnd += diff;
427 }
428 else
429 {
430 memmove (ptr, ptr + diff, m_pEnd - (ptr + diff) + 1);
431 m_pEnd -= diff;
432 }
433 }
434 memcpy (ptr, str2, len2);
435 ptr += len2;
436 m_strlen += utf8_str2.length () - utf8_str1.length ();
437 }
438 else
439 {
440 ++ptr;
441 }
442 }
443 }
444
445 /* FIXME -- these functions assume that &, <, > and " cannot appear in
446 * multi-byte utf8 sequence -- I do not think that holds
447 *
448 * Also, the decode function should handle other & tokens
449 *
450 * Should use glib to traverse these strings
451 */
decodeXML()452 void UT_UTF8Stringbuf::decodeXML ()
453 {
454 if (!m_psz)
455 return;
456
457 size_t shrink = 0;
458 char * p_src = m_psz;
459 char * p_dst = m_psz;
460
461 while (p_src < m_pEnd && *p_src)
462 {
463 if(*p_src == '&')
464 {
465 if (!strncmp (p_src+1, "amp;", 4))
466 {
467 *p_dst++ = '&';
468 p_src += 5;
469 shrink += 4;
470 continue;
471 }
472 else if (!strncmp (p_src+1, "lt;", 3))
473 {
474 *p_dst++ = '<';
475 p_src += 4;
476 shrink += 3;
477 continue;
478 }
479 else if (!strncmp (p_src+1, "gt;", 3))
480 {
481 *p_dst++ = '>';
482 p_src += 4;
483 shrink += 3;
484 continue;
485 }
486 else if (!strncmp (p_src+1, "quot;", 5))
487 {
488 *p_dst++ = '"';
489 p_src += 6;
490 shrink += 5;
491 continue;
492 }
493 }
494
495 *p_dst = *p_src;
496
497 p_dst++;
498 p_src++;
499 }
500
501 *p_dst = 0;
502 m_pEnd -= shrink;
503 }
504
505 /* escapes '<', '>', '\"' and '&' in the current string
506 */
escapeXML()507 void UT_UTF8Stringbuf::escapeXML ()
508 {
509 size_t incr = 0;
510
511 char * ptr = m_psz;
512 while (ptr < m_pEnd)
513 {
514 if ((*ptr == '<') || (*ptr == '>')) incr += 3;
515 else if (*ptr == '&') incr += 4;
516 else if (*ptr == '"') incr += 5;
517 ptr++;
518 }
519 bool bInsert = grow (incr);
520
521 ptr = m_psz;
522 while (ptr < m_pEnd)
523 {
524 if (*ptr == '<')
525 {
526 if (bInsert)
527 {
528 *ptr++ = '&';
529 insert (ptr, "lt;", 3);
530 }
531 else *ptr++ = '?';
532 }
533 else if (*ptr == '>')
534 {
535 if (bInsert)
536 {
537 *ptr++ = '&';
538 insert (ptr, "gt;", 3);
539 }
540 else *ptr++ = '?';
541 }
542 else if (*ptr == '&')
543 {
544 if (bInsert)
545 {
546 *ptr++ = '&';
547 insert (ptr, "amp;", 4);
548 }
549 else *ptr++ = '?';
550 }
551 else if (*ptr == '"')
552 {
553 if (bInsert)
554 {
555 *ptr++ = '&';
556 insert (ptr, "quot;", 5);
557 }
558 else *ptr++ = '?';
559 }
560 else ptr++;
561 }
562 }
563
564 /*
565 this function escapes the string to provide for conformity with
566 http://www.w3.org/TR/xlink/#link-locators, section 5.4
567
568 Just use libxml and hope for the best.
569 */
escapeURL()570 void UT_UTF8Stringbuf::escapeURL ()
571 {
572 if(!m_psz || !*m_psz)
573 return;
574
575 xmlChar * uri = xmlURIEscape(BAD_CAST m_psz);
576 if(uri) {
577 assign((gchar*)uri);
578 xmlFree(uri);
579 }
580 }
581
582 /* decode %xx encoded characters
583 */
584
s_charCode_to_hexval(UT_UCS4Char c)585 static UT_uint32 s_charCode_to_hexval(UT_UCS4Char c)
586 {
587 if(c >= 0x30 && c <= 0x39)
588 return c - 0x30;
589 else if(c >= 0x41 && c <= 0x46)
590 return c - 0x41 + 10;
591 else if(c >= 0x61 && c <= 0x66)
592 return c - 0x61 + 10;
593
594 UT_return_val_if_fail( UT_SHOULD_NOT_HAPPEN, 0 );
595 }
596
decodeURL()597 void UT_UTF8Stringbuf::decodeURL()
598 {
599 if(!m_psz || !*m_psz)
600 return;
601
602 char * buff = (char*)g_try_malloc(byteLength() + 1);
603 UT_return_if_fail( buff );
604 buff[0] = 0;
605
606 UTF8Iterator J(this);
607 const char * ptr = J.current();
608 UT_UCS4Char c = charCode(J.current());
609
610 char utf8cache[7]; utf8cache[6] = 0;
611 UT_uint32 iCachePos = 0;
612 UT_uint32 iCacheNeeded = 0;
613
614
615 while (c != 0)
616 {
617 if(c == '%')
618 {
619 J.advance();
620 UT_UCS4Char b1 = charCode(J.current());
621 J.advance();
622 UT_UCS4Char b2 = charCode(J.current());
623 J.advance();
624
625 if(isalnum(b1) && isalnum(b2))
626 {
627 b1 = s_charCode_to_hexval(b1);
628 b2 = s_charCode_to_hexval(b2);
629
630 UT_UCS4Char code = ((b1 << 4)& 0xf0) | (b2 & 0x0f);
631
632 if(iCacheNeeded == 0)
633 {
634 // we start new utf8 sequence in the cache
635 if ((code & 0x80) == 0) iCacheNeeded = 1;
636 else if ((code & 0xe0) == 0xc0) iCacheNeeded = 2;
637 else if ((code & 0xf0) == 0xe0) iCacheNeeded = 3;
638 else if ((code & 0xf8) == 0xf0) iCacheNeeded = 4;
639 else if ((code & 0xfc) == 0xf8) iCacheNeeded = 5;
640 else if ((code & 0xfe) == 0xfc) iCacheNeeded = 6;
641
642 utf8cache[0] = (char) code;
643 utf8cache[iCacheNeeded] = 0; // make sure the sequence will be terminated
644 iCachePos++;
645 }
646 else
647 {
648 // append to our cache
649 utf8cache[iCachePos++] = (char) code;
650 }
651
652 if(iCacheNeeded == 0 && (code >= 0x7f && code <= 0xff))
653 {
654 // the present character is not a valid start of utf8 sequence --
655 // this is almost certainly a character from the extended ASCII set
656 // which was encoded directly according to the RFC 1738 scheme, we
657 // just append it
658
659 size_t iLenBuff = strlen(buff);
660 size_t iLenLeft = byteLength() - iLenBuff;
661
662 char * p = buff + iLenBuff;
663 UT_Unicode::UCS4_to_UTF8(p, iLenLeft, code);
664
665 // we need to null-terminate
666 *p = 0;
667 }
668
669 if(iCacheNeeded && iCacheNeeded <= iCachePos)
670 {
671 UT_ASSERT_HARMLESS( iCacheNeeded == iCachePos );
672
673 // append the cache to our buffer
674 UT_uint32 iLenBuff = strlen(buff);
675 char * p = buff + iLenBuff;
676 strcat(p, utf8cache);
677
678 iCacheNeeded = iCachePos = 0;
679 }
680 }
681 else
682 {
683 // this should not happen in encoded url and so we will ignore this token;
684 // if we are in the middle of utf8 sequence; we will reset it
685 iCacheNeeded = iCachePos = 0;
686 }
687 }
688 else
689 {
690 J.advance(); // advance here, for the sake of the else clause below
691
692 if(iCacheNeeded > iCachePos)
693 {
694 // we are processing a utf sequence, so just append this byte to our cache
695 utf8cache[iCachePos++] = (char) c;
696 }
697 else
698 {
699 const char * p = J.current();
700 UT_uint32 iLen = p ? p - ptr : strlen(ptr);
701 strncat(buff, ptr, iLen);
702 }
703 }
704
705 ptr = J.current();
706 c = charCode(J.current());
707 }
708
709 assign(buff);
710 g_free(buff);
711 }
712
713 /* translates the current string to MIME "quoted-printable" format
714 */
escapeMIME()715 void UT_UTF8Stringbuf::escapeMIME ()
716 {
717 static const char hex[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' };
718 static const char * s_eol = "=\r\n";
719
720 if (m_strlen == 0) return;
721
722 size_t bytes = 0;
723 char * ptr = m_psz;
724 while (*ptr)
725 {
726 char c = *ptr++;
727 unsigned char u = static_cast<unsigned char>(c);
728
729 if ((c == '\r') || (c == '\n') || (c == '=') || (u & 0x80)) bytes += 2;
730 }
731 if (bytes)
732 {
733 if (!grow (bytes)) return;
734
735 char * pOld = m_pEnd;
736 char * pNew = m_pEnd + bytes;
737
738 while (pOld >= m_psz)
739 {
740 char c = *pOld--;
741 unsigned char u = static_cast<unsigned char>(c);
742
743 if ((u & 0x80) || (c == '\r') || (c == '\n') || (c == '='))
744 {
745 *pNew-- = hex[ u & 0x0f];
746 *pNew-- = hex[(u >> 4) & 0x0f];
747 *pNew-- = '=';
748 }
749 else *pNew-- = c;
750 }
751 m_pEnd += bytes;
752 m_strlen = m_pEnd - m_psz;
753 }
754
755 size_t length = 0;
756 ptr = m_psz;
757 while (true)
758 {
759 if (*ptr == 0)
760 {
761 if (length)
762 {
763 size_t offset = ptr - m_psz;
764 if (grow (3))
765 {
766 ptr = m_psz + offset;
767 insert (ptr, s_eol, 3);
768 }
769 }
770 break;
771 }
772 if (length >= 70)
773 {
774 size_t offset = ptr - m_psz;
775 if (grow (3))
776 {
777 ptr = m_psz + offset;
778 insert (ptr, s_eol, 3);
779 }
780 length = 0;
781 }
782
783 if (*ptr == '=')
784 {
785 ptr += 3;
786 length += 3;
787 }
788 else
789 {
790 ptr++;
791 length++;
792 }
793 }
794 }
795
lowerCase()796 UT_UTF8Stringbuf * UT_UTF8Stringbuf::lowerCase ()
797 {
798 if(!byteLength())
799 return NULL;
800
801 UT_UTF8Stringbuf * n = new UT_UTF8Stringbuf();
802 UT_return_val_if_fail(n, NULL);
803
804 UTF8Iterator s(this);
805 UT_UCS4Char c = charCode(s.current());
806
807 while(c)
808 {
809 UT_UCS4Char l = UT_UCS4_tolower(c);
810 n->appendUCS4(&l,1);
811 c = charCode(s.advance());
812 }
813
814 return n;
815 }
816
clear()817 void UT_UTF8Stringbuf::clear ()
818 {
819 if (m_psz) g_free (m_psz);
820 m_psz = 0;
821 m_pEnd = 0;
822 m_strlen = 0;
823 m_buflen = 0;
824 }
825
insert(char * & ptr,const char * str,size_t utf8length)826 void UT_UTF8Stringbuf::insert (char *& ptr, const char * str, size_t utf8length)
827 {
828 if ( str == 0) return;
829 if (*str == 0) return;
830
831 if ((ptr < m_psz) || (ptr > m_pEnd)) return;
832
833 char * orig_buf = m_psz;
834 char * orig_ptr = ptr;
835
836 size_t length = static_cast<size_t>(strlen(str));
837
838 if (!grow (length)) return;
839
840 ptr = m_psz + (orig_ptr - orig_buf);
841
842 memmove (ptr + length, ptr, (m_pEnd - ptr) + 1);
843 memcpy (ptr, str, length);
844
845 ptr += length;
846 m_pEnd += length;
847 m_strlen += utf8length;
848 }
849
reserve(size_t n)850 void UT_UTF8Stringbuf::reserve(size_t n)
851 {
852 grow(n);
853 }
854
grow(size_t length)855 bool UT_UTF8Stringbuf::grow (size_t length)
856 {
857 if (length + 1 <= (m_buflen - (m_pEnd - m_psz))) return true;
858
859 if (m_psz == 0)
860 {
861 if (length == 0) return true;
862 m_psz = static_cast<char *>(g_try_malloc(length));
863 if (m_psz == 0) return false;
864 m_strlen = 0;
865 m_buflen = length;
866 m_pEnd = m_psz;
867 *m_pEnd = 0;
868 return true;
869 }
870
871 size_t new_length = length + (m_pEnd - m_psz) + 1;
872 size_t end_offset = m_pEnd - m_psz;
873
874 char * more = static_cast<char *>(g_try_realloc(static_cast<void *>(m_psz), new_length));
875 if (more == 0) return false;
876 m_psz = more;
877 m_pEnd = m_psz + end_offset;
878 m_buflen = new_length;
879 return true;
880 }
881
/* Creates an iterator over strbuf.  sync() puts it at the start of the
 * string data, if there is any.
 */
UT_UTF8Stringbuf::UTF8Iterator::UTF8Iterator (const UT_UTF8Stringbuf * strbuf)
	: m_strbuf(strbuf), m_utfbuf(0), m_utfptr(0)
{
	sync ();
}
889
~UTF8Iterator()890 UT_UTF8Stringbuf::UTF8Iterator::~UTF8Iterator ()
891 {
892 //
893 }
894
operator =(const char * position)895 void UT_UTF8Stringbuf::UTF8Iterator::operator=(const char * position)
896 {
897 if (!sync ()) return;
898 if (static_cast<unsigned>(position- m_utfbuf) > m_strbuf->byteLength ())
899 {
900 m_utfptr = m_utfbuf + m_strbuf->byteLength ();
901 }
902 else
903 {
904 m_utfptr = position;
905 }
906 }
907
current()908 const char * UT_UTF8Stringbuf::UTF8Iterator::current ()
909 {
910 if (!sync ()) return 0;
911 if ((*m_utfptr & 0xc0) == 0x80) return 0; // oops - a 'continuing' byte
912 return m_utfptr;
913 }
914
start()915 const char * UT_UTF8Stringbuf::UTF8Iterator::start ()
916 {
917 if (!sync ())
918 return 0;
919 return m_utfbuf;
920 }
921
end()922 const char * UT_UTF8Stringbuf::UTF8Iterator::end ()
923 {
924 if (!sync ())
925 return 0;
926 return m_utfbuf + m_strbuf->byteLength ();
927 }
928
advance()929 const char * UT_UTF8Stringbuf::UTF8Iterator::advance ()
930 {
931 if (!sync ())
932 return 0;
933 if (*m_utfptr == 0)
934 return 0;
935 do {
936 m_utfptr++;
937 } while ((*m_utfptr & 0xc0) == 0x80); // a 'continuing' byte
938 return m_utfptr;
939 }
940
retreat()941 const char * UT_UTF8Stringbuf::UTF8Iterator::retreat ()
942 {
943 if (!sync ())
944 return 0;
945 if (m_utfptr == m_utfbuf)
946 return 0;
947 do {
948 m_utfptr--;
949 } while ((*m_utfptr & 0xc0) == 0x80); // a 'continuing' byte
950 return m_utfptr;
951 }
952
953 // returns false only if there is no string data
sync()954 bool UT_UTF8Stringbuf::UTF8Iterator::sync ()
955 {
956 if (m_strbuf == 0)
957 return false;
958
959 const char * utf8_buffer = m_strbuf->data ();
960 if (utf8_buffer == 0)
961 {
962 m_utfbuf = 0;
963 m_utfptr = 0;
964 return false;
965 }
966
967 size_t utf8_length = m_strbuf->byteLength ();
968
969 /* note that this doesn't guarantee that m_utfptr points to the
970 * start of UTF-8 char sequence
971 */
972 if (static_cast<unsigned>(m_utfptr- m_utfbuf) > utf8_length)
973 {
974 m_utfptr = utf8_buffer + utf8_length;
975 }
976 else
977 {
978 m_utfptr = utf8_buffer + (m_utfptr - m_utfbuf);
979 }
980 m_utfbuf = utf8_buffer;
981
982 return true;
983 }
984
985
986
987