1 /* AbiSource Program Utilities
2 * Copyright (C) 1998 AbiSource, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <algorithm>
28 #include <math.h>
29 #include <ctype.h>
30
31 #include "ut_types.h"
32 #include "ut_misc.h"
33 #include "ut_assert.h"
34 #include "ut_string.h"
35 #include "ut_debugmsg.h"
36 #include "ut_growbuf.h"
37 #include <fribidi.h>
38 #include "ut_mbtowc.h"
39 #include "ut_wctomb.h"
40
41 #include "ut_string_class.h"
42
43 #include "xap_EncodingManager.h"
44
45 #define UT_STRING_CPP
46 #include "ut_case.h"
47 #undef UT_STRING_CPP
48
UT_XML_cloneNoAmpersands(gchar * & rszDest,const gchar * szSource)49 bool UT_XML_cloneNoAmpersands(gchar *& rszDest, const gchar * szSource)
50 {
51 if (szSource == NULL)
52 return false;
53
54 UT_uint32 length = strlen(szSource) + 1;
55 rszDest = static_cast<gchar *>(UT_calloc(length, sizeof(gchar)));
56
57 if (!rszDest)
58 return false;
59
60 const gchar * o = szSource;
61 gchar * n = rszDest;
62 while (*o != 0)
63 {
64 if (*o != '&')
65 {
66 *n = *o;
67 n++;
68 }
69 o++;
70 }
71
72 return true;
73 }
74
UT_XML_cloneConvAmpersands(gchar * & rszDest,const gchar * szSource)75 bool UT_XML_cloneConvAmpersands(gchar *& rszDest, const gchar * szSource)
76 {
77 if (szSource == NULL)
78 return false;
79
80 UT_uint32 length = strlen(szSource) + 1;
81 rszDest = static_cast<gchar *>(UT_calloc(length, sizeof(gchar)));
82
83 if (!rszDest)
84 return false;
85
86 const gchar * o = szSource;
87 gchar * n = rszDest;
88 while (*o != 0)
89 {
90 if (*o != '&')
91 {
92 *n = *o;
93 } else {
94 if (o[1] == '&') {
95 *n++ = '&';
96 }
97 else *n = '_';
98 }
99 n++; o++;
100 }
101
102 return true;
103 }
104
105 /* This uses the clone no ampersands but dumps into a static buffer */
UT_XML_transNoAmpersands(const gchar * szSource)106 const gchar *UT_XML_transNoAmpersands(const gchar * szSource)
107 {
108 static gchar *rszDestBuffer = NULL;
109 static UT_uint32 iDestBufferLength = 0;
110
111 if (szSource == NULL)
112 return NULL;
113
114 UT_uint32 length = strlen(szSource) + 1;
115 if (length > iDestBufferLength) {
116 if (rszDestBuffer && iDestBufferLength) {
117 g_free(rszDestBuffer);
118 }
119 iDestBufferLength = 0;
120 rszDestBuffer = static_cast<gchar *>(UT_calloc(length, sizeof(gchar)));
121
122 if (!rszDestBuffer)
123 return NULL;
124
125 iDestBufferLength = length;
126 }
127 memset(rszDestBuffer, 0, iDestBufferLength);
128
129 const gchar * o = szSource;
130 gchar * n = rszDestBuffer;
131 while (*o != 0)
132 {
133 if (*o != '&')
134 {
135 *n = *o;
136 n++;
137 }
138 o++;
139 }
140
141 return rszDestBuffer;
142 }
143
144 /*! \fn bool UT_isValidXML(const char *s)
145 \param s The string of characters which is to be checked for XML-validity.
146 \retval TRUE if the characters are all valid for XML, FALSE if any one of them is not.
147
148 NB: this function also checks that the string is valid utf-8
149 */
UT_isValidXML(const char * pString)150 bool UT_isValidXML(const char *pString)
151 {
152 if(!pString)
153 return true;
154
155 if(!g_utf8_validate(pString, -1, NULL))
156 return false;
157
158 const UT_Byte * s = reinterpret_cast<const UT_Byte *>(pString);
159
160 while(*s)
161 {
162 if(*s < ' ' && *s != '\t' && *s != '\n' && *s != '\r')
163 {
164 return false;
165 }
166
167 ++s;
168 }
169
170 return true;
171 }
172
/*!
	 XML cannot contain any control characters except \t, \n, \r, see bug 8565
	 (http://www.w3.org/TR/REC-xml/#charsets)

	 This function removes, in place, any such control characters together
	 with multi-byte sequences that are incomplete (a lead byte not followed
	 by the announced number of trailing bytes) and stray trailing bytes.
	 Lead bytes are classified by their high bits only; the decoded value is
	 not checked.

	 The output is never longer than the input, so the string is compacted
	 inside its own buffer and re-terminated.

	 \param pString NUL-terminated, writable string; may be NULL.
	 \return true if the string was modified, false otherwise (including
	         when pString is NULL). An incomplete sequence at the very end of
	         the string is dropped and reported as a modification.
*/
bool UT_validXML(char * pString)
{
	if (!pString)
		return false;

	// char may be signed; inspect the bytes as unsigned values.
	unsigned char * bytes = reinterpret_cast<unsigned char *>(pString);
	const size_t len = strlen(pString);

	bool bChanged = false;
	size_t out = 0;                      // next write position (out <= k always)
	size_t bytesInSequence = 0;          // bytes seen of the pending sequence
	size_t bytesExpectedInSequence = 0;  // length announced by its lead byte

	for (size_t k = 0; k < len; k++)
	{
		const unsigned char b = bytes[k];

		if (b < 0x80) // plain us-ascii
		{
			// An ASCII byte interrupts any pending sequence, which is dropped.
			if (bytesInSequence != 0)
				bChanged = true;

			if (b < ' ' && b != '\t' && b != '\n' && b != '\r')
				bChanged = true;         // illegal control character: drop it
			else
				bytes[out++] = b;

			bytesInSequence = 0;
			bytesExpectedInSequence = 0;
		}
		else if ((b & 0xc0) == 0xc0) // lead byte of a multi-byte sequence
		{
			// A new lead byte while a sequence is pending drops the old one.
			if (bytesInSequence != 0)
				bChanged = true;

			if ((b & 0xf0) == 0xf0)
				bytesExpectedInSequence = 4;
			else if ((b & 0xe0) == 0xe0)
				bytesExpectedInSequence = 3;
			else
				bytesExpectedInSequence = 2;

			bytesInSequence = 1;
		}
		else // trailing byte (0x80..0xbf)
		{
			bytesInSequence++;
			if (bytesInSequence == bytesExpectedInSequence)
			{
				// Sequence complete: copy it down. The source range starts at
				// or after 'out', so a forward byte copy cannot clobber it.
				for (size_t i = k + 1 - bytesInSequence; i <= k; i++)
					bytes[out++] = bytes[i];

				bytesInSequence = 0;
				bytesExpectedInSequence = 0;
			}
		}
	}

	// Bytes still pending here were never copied, i.e. they were removed.
	if (bytesInSequence != 0)
		bChanged = true;

	// make sure we null-terminate
	bytes[out] = 0;
	return bChanged;
}
263
UT_decodeUTF8string(const gchar * pString,UT_uint32 len,UT_GrowBuf * pResult)264 void UT_decodeUTF8string(const gchar * pString, UT_uint32 len, UT_GrowBuf * pResult)
265 {
266 // decode the given string [ p[0]...p[len] ] and append to the given growbuf.
267
268 UT_ASSERT(sizeof(gchar) == sizeof(UT_Byte));
269 const UT_Byte * p = reinterpret_cast<const UT_Byte *>(pString); // gchar is signed...
270
271 int bytesInSequence = 0;
272 int bytesExpectedInSequence = 0;
273 gchar buf[5];
274
275 for (UT_uint32 k=0; k<len; k++)
276 {
277 if (p[k] < 0x80) // plain us-ascii part of latin-1
278 {
279 UT_ASSERT(bytesInSequence == 0);
280 UT_UCSChar c = p[k];
281 pResult->append(reinterpret_cast<UT_GrowBufElement *>(&c),1);
282 }
283 else if ((p[k] & 0xf0) == 0xf0) // lead byte in 4-byte surrogate pair
284 {
285 // surrogate pairs are defined in section 3.7 of the
286 // unicode standard version 2.0 as an extension
287 // mechanism for rare characters in future extensions
288 // of the unicode standard.
289 UT_ASSERT(bytesInSequence == 0);
290 bytesExpectedInSequence = 4;
291 buf[bytesInSequence++] = p[k];
292 }
293 else if ((p[k] & 0xe0) == 0xe0) // lead byte in 3-byte sequence
294 {
295 UT_ASSERT(bytesInSequence == 0);
296 bytesExpectedInSequence = 3;
297 buf[bytesInSequence++] = p[k];
298 }
299 else if ((p[k] & 0xc0) == 0xc0) // lead byte in 2-byte sequence
300 {
301 UT_ASSERT(bytesInSequence == 0);
302 bytesExpectedInSequence = 2;
303 buf[bytesInSequence++] = p[k];
304 }
305 else if ((p[k] & 0x80) == 0x80) // trailing byte in multi-byte sequence
306 {
307 UT_ASSERT(bytesInSequence > 0);
308 buf[bytesInSequence++] = p[k];
309 if (bytesInSequence == bytesExpectedInSequence) // final byte in multi-byte sequence
310 {
311 UT_UCSChar c = g_utf8_get_char(buf);
312 pResult->append(reinterpret_cast<UT_GrowBufElement *>(&c),1);
313 bytesInSequence = 0;
314 bytesExpectedInSequence = 0;
315 }
316 }
317 }
318 }
319
320 /*
321 The following code is from the GNU C library, version 2.0.6.
322 It has been reformatted and tweaked to do Unicode strstrs.
323 All this licensing stuff is kinda ugly, but I didn't want
324 to risk merging the licensing for fear I might break some law.
325 */
326
327 /* Copyright (C) 1994, 1996 Free Software Foundation, Inc.
328 This file is part of the GNU C Library.
329
330 The GNU C Library is free software; you can redistribute it and/or
331 modify it under the terms of the GNU Library General Public License as
332 published by the Free Software Foundation; either version 2 of the
333 License, or (at your option) any later version.
334
335 The GNU C Library is distributed in the hope that it will be useful,
336 but WITHOUT ANY WARRANTY; without even the implied warranty of
337 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
338 Library General Public License for more details.
339
340 You should have received a copy of the GNU Library General Public
341 License along with the GNU C Library; see the file COPYING.LIB. If not,
342 write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
343 Boston, MA 02110-1301 USA. */
344
345 ////////////////////////////////////////////////////////////////////////
346 //
347 // UCS-2 string (UT_UCS2Char)
348 //
349 // String is built of 16-bit units (words)
350 //
351 // TODO: Is this really UCS-2 or UTF-16?
352 // TODO: meaning, does it support surrogates or is it intended to
353 // TODO: support them at any time in the future?
354 // TODO: Correctly, UCS-2 does not support surrogates and UTF-16 does.
355 // TODO: BUT Microsoft calls their native Unicode encoding UCS-2
356 // TODO: while it supports surrogates and is thus really UTF-16.
357 // TODO: Surrogates are Unicode characters with codepoints above
358 // TODO: 65535 which cannot therefore fit into a 2-byte word.
359 // TODO: This means that TRUE UCS-2 is a single-word encoding and
360 // TODO: UTF-16 is a multi-word encoding.
361 //
362 // NOTE: We shouldn't actually need 16-bit strings anymore since
363 // NOTE: AbiWord is now fully converted to using 32-bit Unicode
364 // NOTE: internally. The only possible needs for this is for
365 // NOTE: Windows GUI, filesystem and API functions where applicable;
366 // NOTE: and perhaps some file formats or external libraries
367 //
368 ////////////////////////////////////////////////////////////////////////
369
370 // Don't ifdef out strlen since it's used by the MSWord importer...
371
372 // TODO is this really UCS-2 or UTF-16?
373 // TODO and are we using strlen for the number of 16-bit words
374 // TODO or the number of characters?
375 // TODO Because UTF-16 characters are sometimes expressed as 2 words
376
UT_UCS2_strlen(const UT_UCS2Char * string)377 UT_uint32 UT_UCS2_strlen(const UT_UCS2Char * string)
378 {
379 UT_uint32 i;
380
381 for(i = 0; *string != 0; string++, i++)
382 ;
383
384 return i;
385 }
386
387 #ifdef ENABLE_UCS2_STRINGS
388 /*
389 * My personal strstr() implementation that beats most other algorithms.
390 * Until someone tells me otherwise, I assume that this is the
391 * fastest implementation of strstr() in C.
392 * I deliberately chose not to comment it. You should have at least
393 * as much fun trying to understand it, as I had to write it :-).
394 *
395 * Stephen R. van den Berg, berg@pool.informatik.rwth-aachen.de */
396
/*!
 * Find the first occurrence of pneedle inside phaystack (both 0-terminated
 * UT_UCS2Char strings), comparing units exactly.
 *
 * Changes relative to the glibc-derived original, search logic untouched:
 *  - the initial scan no longer forms a pointer one before the start of the
 *    haystack (the "possible ANSI violation");
 *  - the obsolete 'register' storage class is gone;
 *  - the result is produced with const_cast (static_cast cannot drop const).
 *
 * \return pointer to the start of the match inside phaystack, phaystack
 *         itself when the needle is empty, or 0 when there is no match.
 */
UT_UCS2Char * UT_UCS2_strstr(const UT_UCS2Char * phaystack, const UT_UCS2Char * pneedle)
{
	const UT_UCS2Char *haystack, *needle;
	UT_UCS2Char b, c;

	haystack = phaystack;
	needle = pneedle;

	b = *needle;				// first unit of the needle
	if (b != '\0')
	{
		// Advance to the first haystack unit equal to b.
		c = *haystack;
		while (c != b)
		{
			if (c == '\0')
				goto ret0;
			c = *++haystack;
		}

		c = *++needle;			// second unit of the needle
		if (c == '\0')
			goto foundneedle;	// one-unit needle: already matched
		++needle;				// needle now points at its third unit
		goto jin;

		for (;;)
		{
			UT_UCS2Char a;
			const UT_UCS2Char *rhaystack, *rneedle;

			// Scan forward (two units per iteration) for the next b.
			do
			{
				a = *++haystack;
				if (a == '\0')
					goto ret0;
				if (a == b)
					break;
				a = *++haystack;
				if (a == '\0')
					goto ret0;
			shloop:	; // need a statement here for EGCS 1.1.1 to accept it
			}
			while (a != b);

			// haystack points at a b; the following unit must equal c.
		jin:	a = *++haystack;
			if (a == '\0')
				goto ret0;

			if (a != c)
				goto shloop;

			// First two units match: step haystack back onto the b and
			// compare the remainder of the needle.
			rhaystack = haystack-- + 1;
			rneedle = needle;
			a = *rneedle;

			if (*rhaystack == a)
				do
				{
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = *++needle;
					if (*rhaystack != a)
						break;
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = *++needle;
				}
				while (*rhaystack == a);

			needle = rneedle;		/* took the register-poor approach */

			// Stopping on the needle's terminator means the whole needle matched.
			if (a == '\0')
				break;
		}
	}
foundneedle:
	return const_cast<UT_UCS2Char *>(haystack);
ret0:
	return 0;
}
480
UT_UCS2_strcmp(const UT_UCS2Char * left,const UT_UCS2Char * right)481 UT_sint32 UT_UCS2_strcmp(const UT_UCS2Char* left, const UT_UCS2Char* right)
482 {
483 UT_ASSERT(left);
484 UT_ASSERT(right);
485
486 while (*left && *right)
487 {
488 if (*left < *right)
489 {
490 return -1;
491 }
492
493 if (*left > *right)
494 {
495 return 1;
496 }
497
498 left++;
499 right++;
500 }
501
502 if (*left)
503 {
504 return -1;
505 }
506 else if (*right)
507 {
508 return 1;
509 }
510 else
511 {
512 return 0;
513 }
514 }
515
516 /*
517 Latin-1 Unicode case-insensitive string comparison and casing done by
518 Pierre Sarrazin <ps@cam.org>.
519 */
520
521 /**
522 * Convert a given character to uppercase
523 */
UT_UCS2_toupper(UT_UCS2Char c)524 UT_UCS2Char UT_UCS2_toupper(UT_UCS2Char c)
525 {
526 if (c < 128) // in ASCII range
527 return toupper(c);
528
529 if (XAP_EncodingManager::get_instance()->single_case())
530 return c;
531 /*let's trust libc! -- does not seem to work :(*/
532 case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
533 if(!letter || letter->type == 1)
534 return c;
535 return letter->other;
536 }
537
538
539 /* Converts the given character to lowercase if it is an uppercase letter.
540 Returns it unchanged if it is not.
541 This function created by Pierre Sarrazin 1999-02-06
542 */
543
UT_UCS2_tolower(UT_UCS2Char c)544 UT_UCS2Char UT_UCS2_tolower(UT_UCS2Char c)
545 {
546 if (c < 128)
547 return tolower(c);
548 if (XAP_EncodingManager::get_instance()->single_case())
549 return c;
550 /*let's trust libc!*/
551 case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
552 if(!letter || letter->type == 0)
553 return c;
554 return letter->other;
555 }
556
557
558 /* Characters are converted to lowercase (if applicable) when they
559 are read from the needle or the haystack. See UT_UCS_tolower().
560 This function created by Pierre Sarrazin 1999-02-06
561 */
562
UT_UCS2_stristr(const UT_UCS2Char * phaystack,const UT_UCS2Char * pneedle)563 UT_UCS2Char * UT_UCS2_stristr(const UT_UCS2Char * phaystack, const UT_UCS2Char * pneedle)
564 {
565 register const UT_UCS2Char *haystack, *needle;
566 register UT_UCS2Char b, c;
567
568 haystack = phaystack;
569 needle = pneedle;
570
571 b = UT_UCS2_tolower(*needle);
572 if (b != '\0')
573 {
574 haystack--; /* possible ANSI violation */
575 do
576 {
577 c = UT_UCS2_tolower(*++haystack);
578 if (c == '\0')
579 goto ret0;
580 }
581 while (c != b);
582
583 c = UT_UCS2_tolower(*++needle);
584 if (c == '\0')
585 goto foundneedle;
586 ++needle;
587 goto jin;
588
589 for (;;)
590 {
591 register UT_UCS2Char a;
592 register const UT_UCS2Char *rhaystack, *rneedle;
593
594 do
595 {
596 a = UT_UCS2_tolower(*++haystack);
597 if (a == '\0')
598 goto ret0;
599 if (a == b)
600 break;
601 a = UT_UCS2_tolower(*++haystack);
602 if (a == '\0')
603 goto ret0;
604 shloop: ; // need a statement here for EGCS 1.1.1 to accept it
605 }
606 while (a != b);
607
608 jin: a = UT_UCS2_tolower(*++haystack);
609 if (a == '\0')
610 goto ret0;
611
612 if (a != c)
613 goto shloop;
614
615 rhaystack = haystack-- + 1;
616 rneedle = needle;
617 a = UT_UCS2_tolower(*rneedle);
618
619 if (UT_UCS2_tolower(*rhaystack) == a)
620 do
621 {
622 if (a == '\0')
623 goto foundneedle;
624 ++rhaystack;
625 a = UT_UCS2_tolower(*++needle);
626 if (UT_UCS2_tolower(*rhaystack) != a)
627 break;
628 if (a == '\0')
629 goto foundneedle;
630 ++rhaystack;
631 a = UT_UCS2_tolower(*++needle);
632 }
633 while (UT_UCS2_tolower(*rhaystack) == a);
634
635 needle = rneedle; /* took the register-poor approach */
636
637 if (a == '\0')
638 break;
639 }
640 }
641 foundneedle:
642 return static_cast<UT_UCS2Char *>(haystack);
643 ret0:
644 return 0;
645 }
646 /****************************************************************************/
647
UT_UCS2_strcpy(UT_UCS2Char * dest,const UT_UCS2Char * src)648 UT_UCS2Char * UT_UCS2_strcpy(UT_UCS2Char * dest, const UT_UCS2Char * src)
649 {
650 UT_ASSERT(dest);
651 UT_ASSERT(src);
652
653 UT_UCS2Char * d = dest;
654 UT_UCS2Char * s = static_cast<UT_UCS2Char *>(src);
655
656 while (*s != 0)
657 *d++ = *s++;
658 *d = 0;
659
660 return dest;
661 }
662
UT_UCS2_strcpy_char(UT_UCS2Char * dest,const char * src)663 UT_UCS2Char * UT_UCS2_strcpy_char(UT_UCS2Char * dest, const char * src)
664 {
665 UT_ASSERT(dest);
666 UT_ASSERT(src);
667
668 UT_UCS2Char * d = dest;
669 unsigned char * s = static_cast<unsigned char *>(src);
670
671 static UT_UCS2_mbtowc m(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
672 UT_UCS2Char wc;
673
674 while (*s != 0)
675 {
676 if(m.mbtowc(wc,*s))*d++=wc;
677 s++;
678 }
679 *d = 0;
680
681 return dest;
682 }
683
/*!
 * Placeholder: no conversion is performed. After asserting that both
 * pointers are non-NULL the function trips UT_ASSERT_NOT_REACHED() and
 * returns NULL; dest is never written.
 */
char * UT_UCS2_strcpy_to_char(char * dest, const UT_UCS2Char * src)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	// Not implemented: calling this is flagged as a programming error.
	UT_ASSERT_NOT_REACHED();

	return NULL;
}
693
UT_UCS2_cloneString(UT_UCS2Char ** dest,const UT_UCS2Char * src)694 bool UT_UCS2_cloneString(UT_UCS2Char ** dest, const UT_UCS2Char * src)
695 {
696 UT_uint32 length = UT_UCS2_strlen(src) + 1;
697 *dest = static_cast<UT_UCS2Char *>(UT_calloc(length,sizeof(UT_UCS2Char)));
698 if (!*dest)
699 return false;
700 memmove(*dest,src,length*sizeof(UT_UCS2Char));
701
702 return true;
703 }
704
/*!
 * Placeholder: nothing is cloned. The function trips
 * UT_ASSERT_NOT_REACHED() and returns false; *dest is never written.
 */
bool UT_UCS2_cloneString_char(UT_UCS2Char ** dest, const char * src)
{
	// Not implemented: calling this is flagged as a programming error.
	UT_ASSERT_NOT_REACHED();
	return false;
}
710
UT_UCS2_isupper(UT_UCS2Char c)711 bool UT_UCS2_isupper(UT_UCS2Char c)
712 {
713 if(c < 127)
714 return isupper(c)!=0;
715
716 case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
717 if(letter && letter->type == 1)
718 return true;
719 return false;
720 };
721
UT_UCS2_islower(UT_UCS2Char c)722 bool UT_UCS2_islower(UT_UCS2Char c)
723 {
724 if(c < 127)
725 return islower(c)!=0;
726
727 case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
728 if(!letter || letter->type == 0)
729 return true;
730 return false;
731 };
732
UT_UCS2_isspace(UT_UCS2Char c)733 bool UT_UCS2_isspace(UT_UCS2Char c)
734 {
735 // the whitespace table is small, so use linear search
736 for (UT_uint32 i = 0; i < G_N_ELEMENTS(whitespace_table); i++)
737 {
738 if(whitespace_table[i].high < c)
739 continue;
740 if(whitespace_table[i].low <= c)
741 return true;
742 // if we got here, then low > c
743 return false;
744 }
745 return false;
746 };
747
UT_UCS2_isalpha(UT_UCS2Char c)748 bool UT_UCS2_isalpha(UT_UCS2Char c)
749 {
750 UT_BidiCharType type = UT_bidiGetCharType(c);
751 return (UT_BIDI_IS_LETTER(type) != 0);
752 };
753
UT_UCS2_isSentenceSeparator(UT_UCS2Char c)754 bool UT_UCS2_isSentenceSeparator(UT_UCS2Char c)
755 {
756 switch(c)
757 {
758 case '?': // fall-through
759 case '!': // fall-through
760 case '.':
761 return true;
762
763 default:
764 return false;
765 }
766 }
767
768 /* copies exactly n-chars from src to dest; NB! does not check for 00 i src
769 */
UT_UCS2_strncpy(UT_UCS2Char * dest,const UT_UCS2Char * src,UT_uint32 n)770 UT_UCS2Char * UT_UCS2_strncpy(UT_UCS2Char * dest, const UT_UCS2Char * src, UT_uint32 n)
771 {
772 UT_ASSERT(dest);
773 UT_ASSERT(src);
774
775 UT_UCS2Char * d = dest;
776 UT_UCS2Char * s = static_cast<UT_UCS2Char *>(src);
777
778 for (; d < static_cast<UT_UCS2Char *>(dest) + n;)
779 *d++ = *s++;
780 *d = '\0';
781
782 return dest;
783 }
784
785
786 /* reverses str of len n; used by BiDi which always knows the len of string to process
787 thus we can save ourselves searching for the 00 */
UT_UCS2_strnrev(UT_UCS2Char * src,UT_uint32 n)788 UT_UCS2Char * UT_UCS2_strnrev(UT_UCS2Char * src, UT_uint32 n)
789 {
790 UT_UCS2Char t;
791 UT_uint32 i;
792
793 for(i = 0; i < n/2; i++)
794 {
795 t = *(src + i);
796 *(src + i) = *(src + n - i - 1); //-1 so that we do not move the 00
797 *(src + n - i - 1) = t;
798 }
799 return src;
800 }
801
802 #endif
803
804
805 ////////////////////////////////////////////////////////////////////////
806 //
807 // UCS string (UT_UCSChar)
808 //
809 // String is built of units based on UT_UCSChar, which used to be
810 // UT_UCS2Char and is now UT_UCS4Char
811 //
812 ////////////////////////////////////////////////////////////////////////
813
UT_isSmartQuotableCharacter(UT_UCSChar c)814 bool UT_isSmartQuotableCharacter(UT_UCSChar c)
815 {
816 // TODO: this is anglo-centric; really need a locale argument or
817 // TODO: something to get smart quote rules for the rest of the world
818 bool result;
819 switch (c)
820 {
821 case '"':
822 case '`':
823 case '\'':
824 result = true;
825 break;
826 default:
827 result = false;
828 break;
829 }
830 return (result);
831 }
832
UT_isSmartQuotedCharacter(UT_UCSChar c)833 bool UT_isSmartQuotedCharacter(UT_UCSChar c)
834 {
835 bool result;
836 switch (c)
837 {
838 case UCS_LQUOTE:
839 case UCS_RQUOTE:
840 case UCS_LDBLQUOTE:
841 case UCS_RDBLQUOTE:
842 case 0x201a:
843 case 0x201e:
844 case 0x2039:
845 case 0x203a:
846 case 0x300c:
847 case 0x300d:
848 case 0x300e:
849 case 0x300f:
850 case '\"':
851 case '\'':
852 result = true;
853 break;
854 default:
855 result = false;
856 break;
857 }
858 return (result);
859 }
860
861 ////////////////////////////////////////////////////////////////////////
862 //
863 // UCS-4 string
864 //
865 // String is built of 32-bit units (longs)
866 //
867 // NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference
868 // NOTE: in the case of UCS-4 and UTF-32 since they really are
869 // NOTE: identical
870 //
871 ////////////////////////////////////////////////////////////////////////
872
UT_UCS4_isupper(UT_UCS4Char c)873 bool UT_UCS4_isupper(UT_UCS4Char c)
874 {
875 if(c < 127)
876 return isupper(c)!=0;
877
878 case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
879 if(letter && letter->type == 1)
880 return true;
881 return false;
882 }
883
UT_UCS4_islower(UT_UCS4Char c)884 bool UT_UCS4_islower(UT_UCS4Char c)
885 {
886 if(c < 127)
887 return islower(c)!=0;
888
889 case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
890 if(!letter || letter->type == 0)
891 return true;
892 return false;
893 }
894
UT_UCS4_isspace(UT_UCS4Char c)895 bool UT_UCS4_isspace(UT_UCS4Char c)
896 {
897 // the whitespace table is small, so use linear search
898 for (UT_uint32 i = 0; i < G_N_ELEMENTS(whitespace_table); i++)
899 {
900 if(whitespace_table[i].high < c)
901 continue;
902 if(whitespace_table[i].low <= c)
903 return true;
904 // if we got here, then low > c
905 return false;
906 }
907 return false;
908 }
909
UT_UCS4_isalpha(UT_UCS4Char c)910 bool UT_UCS4_isalpha(UT_UCS4Char c)
911 {
912 UT_BidiCharType type = UT_bidiGetCharType(c);
913 return (UT_BIDI_IS_LETTER(type) != 0);
914 }
915
UT_UCS4_isSentenceSeparator(UT_UCS4Char c)916 bool UT_UCS4_isSentenceSeparator(UT_UCS4Char c)
917 {
918 switch(c)
919 {
920 case '?': // fall-through
921 case '!': // fall-through
922 case '.':
923 return true;
924
925 default:
926 return false;
927 }
928 }
929
UT_UCS4_isdigit(UT_UCS4Char c)930 bool UT_UCS4_isdigit(UT_UCS4Char c)
931 {
932 if (c < 0x700) {
933 for (unsigned int i=0; i < G_N_ELEMENTS(digits_table); i++) {
934 if (c < digits_table[i].low) break;
935 if (c <= digits_table[i].high)
936 return true;
937 }
938 } else {
939 ucs_range * rng = static_cast<ucs_range *>(bsearch(&c, &digits_table,
940 G_N_ELEMENTS(digits_table),sizeof(ucs_range),s_cmp_digits));
941 if (rng) return true;
942 }
943 return false;
944 }
945
946 /* copies exactly n-chars from src to dest; NB! does not check for 00 i src
947 */
UT_UCS4_strncpy(UT_UCS4Char * dest,const UT_UCS4Char * src,UT_uint32 n)948 UT_UCS4Char * UT_UCS4_strncpy(UT_UCS4Char * dest, const UT_UCS4Char * src, UT_uint32 n)
949 {
950 UT_ASSERT(dest);
951 UT_ASSERT(src);
952
953 UT_UCSChar * d = dest;
954 const UT_UCSChar * s = static_cast<const UT_UCS4Char *>(src);
955
956 for (; d < static_cast<UT_UCS4Char *>(dest) + n;)
957 *d++ = *s++;
958 *d = '\0';
959
960 return dest;
961 }
962
963
964 /* reverses str of len n; used by BiDi which always knows the len of string to process
965 thus we can save ourselves searching for the 00 */
UT_UCS4_strnrev(UT_UCS4Char * src,UT_uint32 n)966 UT_UCS4Char * UT_UCS4_strnrev(UT_UCS4Char * src, UT_uint32 n)
967 {
968 UT_UCS4Char t;
969 UT_uint32 i;
970
971 for(i = 0; i < n/2; i++)
972 {
973 t = *(src + i);
974 *(src + i) = *(src + n - i - 1); //-1 so that we do not move the 00
975 *(src + n - i - 1) = t;
976 }
977 return src;
978 }
979
980
/*!
 * Find the first occurrence of pneedle inside phaystack (both 0-terminated
 * UT_UCS4Char strings), comparing units exactly.
 *
 * Changes relative to the glibc-derived original, search logic untouched:
 *  - the initial scan no longer forms a pointer one before the start of the
 *    haystack (the "possible ANSI violation");
 *  - the obsolete 'register' storage class is gone.
 *
 * \return pointer to the start of the match inside phaystack, phaystack
 *         itself when the needle is empty, or 0 when there is no match.
 */
UT_UCS4Char * UT_UCS4_strstr(const UT_UCS4Char * phaystack, const UT_UCS4Char * pneedle)
{
	const UT_UCS4Char *haystack, *needle;
	UT_UCS4Char b, c;

	haystack = phaystack;
	needle = pneedle;

	b = *needle;				// first unit of the needle
	if (b != '\0')
	{
		// Advance to the first haystack unit equal to b.
		c = *haystack;
		while (c != b)
		{
			if (c == '\0')
				goto ret0;
			c = *++haystack;
		}

		c = *++needle;			// second unit of the needle
		if (c == '\0')
			goto foundneedle;	// one-unit needle: already matched
		++needle;				// needle now points at its third unit
		goto jin;

		for (;;)
		{
			UT_UCS4Char a;
			const UT_UCS4Char *rhaystack, *rneedle;

			// Scan forward (two units per iteration) for the next b.
			do
			{
				a = *++haystack;
				if (a == '\0')
					goto ret0;
				if (a == b)
					break;
				a = *++haystack;
				if (a == '\0')
					goto ret0;
			shloop:	; // need a statement here for EGCS 1.1.1 to accept it
			}
			while (a != b);

			// haystack points at a b; the following unit must equal c.
		jin:	a = *++haystack;
			if (a == '\0')
				goto ret0;

			if (a != c)
				goto shloop;

			// First two units match: step haystack back onto the b and
			// compare the remainder of the needle.
			rhaystack = haystack-- + 1;
			rneedle = needle;
			a = *rneedle;

			if (*rhaystack == a)
				do
				{
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = *++needle;
					if (*rhaystack != a)
						break;
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = *++needle;
				}
				while (*rhaystack == a);

			needle = rneedle;		/* took the register-poor approach */

			// Stopping on the needle's terminator means the whole needle matched.
			if (a == '\0')
				break;
		}
	}
foundneedle:
	return const_cast<UT_UCS4Char *>(haystack);
ret0:
	return 0;
}
1064
UT_UCS4_strcmp(const UT_UCS4Char * left,const UT_UCS4Char * right)1065 UT_sint32 UT_UCS4_strcmp(const UT_UCS4Char* left, const UT_UCS4Char* right)
1066 {
1067 UT_ASSERT(left);
1068 UT_ASSERT(right);
1069
1070 while (*left && *right)
1071 {
1072 if (*left < *right)
1073 {
1074 return -1;
1075 }
1076
1077 if (*left > *right)
1078 {
1079 return 1;
1080 }
1081
1082 left++;
1083 right++;
1084 }
1085
1086 if (*left)
1087 {
1088 return -1;
1089 }
1090 else if (*right)
1091 {
1092 return 1;
1093 }
1094 else
1095 {
1096 return 0;
1097 }
1098 }
1099
1100 /*
1101 Latin-1 Unicode case-insensitive string comparison and casing done by
1102 Pierre Sarrazin <ps@cam.org>.
1103 */
1104
1105 /**
1106 * Convert a given character to uppercase
1107 */
UT_UCS4_toupper(UT_UCS4Char c)1108 UT_UCS4Char UT_UCS4_toupper(UT_UCS4Char c)
1109 {
1110 if (c < 128) // in ASCII range
1111 return toupper(c);
1112
1113 if (XAP_EncodingManager::get_instance()->single_case())
1114 return c;
1115 /*let's trust libc! -- does not seem to work :(*/
1116 case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
1117 if(!letter || letter->type == 1)
1118 return c;
1119 return letter->other;
1120 }
1121
1122
1123 /* Converts the given character to lowercase if it is an uppercase letter.
1124 Returns it unchanged if it is not.
1125 This function created by Pierre Sarrazin 1999-02-06
1126 */
1127
UT_UCS4_tolower(UT_UCS4Char c)1128 UT_UCS4Char UT_UCS4_tolower(UT_UCS4Char c)
1129 {
1130 if (c < 128)
1131 return tolower(c);
1132
1133 if (XAP_EncodingManager::get_instance()->single_case())
1134 return c;
1135 /*let's trust libc!*/
1136 case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
1137 if(!letter || letter->type == 0)
1138 return c;
1139 return letter->other;
1140 }
1141
1142
1143 /* Characters are converted to lowercase (if applicable) when they
1144 are read from the needle or the haystack. See UT_UCS_tolower().
1145 This function created by Pierre Sarrazin 1999-02-06
1146 */
1147
UT_UCS4_stristr(const UT_UCS4Char * phaystack,const UT_UCS4Char * pneedle)1148 UT_UCS4Char * UT_UCS4_stristr(const UT_UCS4Char * phaystack, const UT_UCS4Char * pneedle)
1149 {
1150 register const UT_UCS4Char *haystack, *needle;
1151 register UT_UCS4Char b, c;
1152
1153 haystack = static_cast<const UT_UCS4Char *>(phaystack);
1154 needle = static_cast<const UT_UCS4Char *>(pneedle);
1155
1156 b = UT_UCS4_tolower(*needle);
1157 if (b != '\0')
1158 {
1159 haystack--; /* possible ANSI violation */
1160 do
1161 {
1162 c = UT_UCS4_tolower(*++haystack);
1163 if (c == '\0')
1164 goto ret0;
1165 }
1166 while (c != b);
1167
1168 c = UT_UCS4_tolower(*++needle);
1169 if (c == '\0')
1170 goto foundneedle;
1171 ++needle;
1172 goto jin;
1173
1174 for (;;)
1175 {
1176 register UT_UCS4Char a;
1177 register const UT_UCS4Char *rhaystack, *rneedle;
1178
1179 do
1180 {
1181 a = UT_UCS4_tolower(*++haystack);
1182 if (a == '\0')
1183 goto ret0;
1184 if (a == b)
1185 break;
1186 a = UT_UCS4_tolower(*++haystack);
1187 if (a == '\0')
1188 goto ret0;
1189 shloop: ; // need a statement here for EGCS 1.1.1 to accept it
1190 }
1191 while (a != b);
1192
1193 jin: a = UT_UCS4_tolower(*++haystack);
1194 if (a == '\0')
1195 goto ret0;
1196
1197 if (a != c)
1198 goto shloop;
1199
1200 rhaystack = haystack-- + 1;
1201 rneedle = needle;
1202 a = UT_UCS4_tolower(*rneedle);
1203
1204 if (UT_UCS4_tolower(*rhaystack) == a)
1205 do
1206 {
1207 if (a == '\0')
1208 goto foundneedle;
1209 ++rhaystack;
1210 a = UT_UCS4_tolower(*++needle);
1211 if (UT_UCS4_tolower(*rhaystack) != a)
1212 break;
1213 if (a == '\0')
1214 goto foundneedle;
1215 ++rhaystack;
1216 a = UT_UCS4_tolower(*++needle);
1217 }
1218 while (UT_UCS4_tolower(*rhaystack) == a);
1219
1220 needle = rneedle; /* took the register-poor approach */
1221
1222 if (a == '\0')
1223 break;
1224 }
1225 }
1226 foundneedle:
1227 return const_cast<UT_UCS4Char *>(haystack);
1228 ret0:
1229 return 0;
1230 }
1231 /****************************************************************************/
1232
UT_UCS4_strlen(const UT_UCS4Char * string)1233 UT_uint32 UT_UCS4_strlen(const UT_UCS4Char * string)
1234 {
1235 UT_uint32 i;
1236
1237 for(i = 0; *string != 0; string++, i++)
1238 ;
1239
1240 return i;
1241 }
1242
UT_UCS4_strlen_as_char(const UT_UCS4Char * string)1243 UT_uint32 UT_UCS4_strlen_as_char(const UT_UCS4Char * string)
1244 {
1245 UT_uint32 i = 0;
1246
1247 char d[4]; // assuming that any character can be coded with no more that 4 bytes.
1248
1249 UT_Wctomb w(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1250
1251 while (*string != 0)
1252 {
1253 int length;
1254 w.wctomb_or_fallback(d,length,*string++);
1255 i+=length;
1256 }
1257
1258 return i;
1259 }
1260
UT_UCS4_strcpy(UT_UCS4Char * dest,const UT_UCS4Char * src)1261 UT_UCS4Char * UT_UCS4_strcpy(UT_UCS4Char * dest, const UT_UCS4Char * src)
1262 {
1263 UT_ASSERT(dest);
1264 UT_ASSERT(src);
1265
1266 UT_UCS4Char * d = dest;
1267 const UT_UCS4Char * s = static_cast<const UT_UCS4Char *>(src);
1268
1269 while (*s != 0)
1270 *d++ = *s++;
1271 *d = 0;
1272
1273 return dest;
1274 }
1275
1276 // TODO shouldn't all of the 'char *' strings be 'unsigned char *' strings ??
1277
UT_UCS4_strcpy_char(UT_UCS4Char * dest,const char * src)1278 UT_UCS4Char * UT_UCS4_strcpy_char(UT_UCS4Char * dest, const char * src)
1279 {
1280 UT_ASSERT(dest);
1281 UT_ASSERT(src);
1282
1283 UT_UCS4Char * d = dest;
1284 const char * s = static_cast<const char *>(src);
1285
1286 static UT_UCS4_mbtowc m(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1287 UT_UCS4Char wc;
1288
1289 while (*s != 0)
1290 {
1291 if(m.mbtowc(wc,*s))*d++=wc;
1292 s++;
1293 }
1294 *d = 0;
1295
1296 return dest;
1297 }
1298
UT_UCS4_strncpy_char(UT_UCS4Char * dest,const char * src,int n)1299 UT_UCS4Char * UT_UCS4_strncpy_char(UT_UCS4Char * dest, const char * src, int n)
1300 {
1301 UT_ASSERT(dest);
1302 UT_ASSERT(src);
1303
1304 UT_UCS4Char * d = dest;
1305 const char * s = static_cast<const char *>(src);
1306
1307 static UT_UCS4_mbtowc m(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1308 UT_UCS4Char wc;
1309
1310 while (*s != 0 && n > 0)
1311 {
1312 if(m.mbtowc(wc,*s))*d++=wc;
1313 s++;
1314 n--;
1315 }
1316 *d = 0;
1317
1318 return dest;
1319 }
1320
UT_UCS4_strcpy_utf8_char(UT_UCS4Char * dest,const char * src)1321 UT_UCS4Char * UT_UCS4_strcpy_utf8_char(UT_UCS4Char * dest, const char * src)
1322 {
1323 // FIXME: This could be more efficient than it is, on the other
1324 // hand, it should be correct
1325
1326 UT_ASSERT(dest);
1327 UT_ASSERT(src);
1328
1329 UT_UCS4String ucs4str(src); // constructs a string from UTF-8 by default
1330 dest = UT_UCS4_strcpy(dest, ucs4str.ucs4_str());
1331
1332 return dest;
1333 }
1334
1335
UT_UCS4_strcpy_to_char(char * dest,const UT_UCS4Char * src)1336 char * UT_UCS4_strcpy_to_char(char * dest, const UT_UCS4Char * src)
1337 {
1338 UT_ASSERT(dest);
1339 UT_ASSERT(src);
1340
1341 char * d = dest;
1342 const UT_UCS4Char * s = static_cast<const UT_UCS4Char *>(src);
1343
1344 UT_Wctomb w(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1345
1346 while (*s != 0)
1347 {
1348 int length;
1349 w.wctomb_or_fallback(d,length,*s++);
1350 d+=length;
1351 }
1352 *d = 0;
1353
1354 return dest;
1355 }
1356
UT_UCS4_strncpy_to_char(char * dest,const UT_UCS4Char * src,int n)1357 char * UT_UCS4_strncpy_to_char(char * dest, const UT_UCS4Char * src, int n)
1358 {
1359 UT_ASSERT(dest);
1360 UT_ASSERT(src);
1361
1362 char * d = dest;
1363 const UT_UCS4Char * s = static_cast<const UT_UCS4Char *>(src);
1364
1365 UT_Wctomb w(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1366
1367 while (*s != 0 && n > 0)
1368 {
1369 int length;
1370 w.wctomb_or_fallback(d,length,*s++, n);
1371 d+=length;
1372 n-=length;
1373 }
1374 *d = 0;
1375
1376 return dest;
1377 }
1378
UT_UCS4_cloneString(UT_UCS4Char ** dest,const UT_UCS4Char * src)1379 bool UT_UCS4_cloneString(UT_UCS4Char ** dest, const UT_UCS4Char * src)
1380 {
1381 UT_uint32 length = UT_UCS4_strlen(src) + 1;
1382 *dest = static_cast<UT_UCS4Char *>(UT_calloc(length,sizeof(UT_UCS4Char)));
1383 if (!*dest)
1384 return false;
1385 memmove(*dest,src,length*sizeof(UT_UCS4Char));
1386
1387 return true;
1388 }
1389
UT_UCS4_cloneString_char(UT_UCS4Char ** dest,const char * src)1390 bool UT_UCS4_cloneString_char(UT_UCS4Char ** dest, const char * src)
1391 {
1392 UT_uint32 length = strlen(src) + 1;
1393 *dest = static_cast<UT_UCS4Char *>(UT_calloc(length,sizeof(UT_UCS4Char)));
1394 if (!*dest)
1395 return false;
1396 UT_UCS4_strcpy_char(*dest, src);
1397
1398 return true;
1399 }
1400
s_pass_name(const char * & csstr,char end)1401 static const char * s_pass_name (const char *& csstr, char end)
1402 {
1403 const char * name_end = csstr;
1404
1405 while (*csstr)
1406 {
1407 unsigned char u = static_cast<unsigned char>(*csstr);
1408 if (u & 0x80)
1409 {
1410 UT_UTF8Stringbuf::UCS4Char ucs4 = UT_UTF8Stringbuf::charCode (csstr);
1411 if (UT_UCS4_isspace (ucs4))
1412 {
1413 name_end = csstr;
1414 break;
1415 }
1416 while (static_cast<unsigned char>(*++csstr) & 0x80)
1417 ;
1418 continue;
1419 }
1420 else if ((isspace (static_cast<int>(u))) || (*csstr == end))
1421 {
1422 name_end = csstr;
1423 break;
1424 }
1425 csstr++;
1426 }
1427 return name_end;
1428 }
1429
1430
s_pass_value(const char * & csstr)1431 static const char * s_pass_value (const char *& csstr)
1432 {
1433 const char * value_end = csstr;
1434
1435 bool bQuoted = false;
1436 while (*csstr)
1437 {
1438 bool bSpace = false;
1439 unsigned char u = static_cast<unsigned char>(*csstr);
1440 if (u & 0x80)
1441 {
1442 UT_UTF8Stringbuf::UCS4Char ucs4 = UT_UTF8Stringbuf::charCode (csstr);
1443
1444 if (!bQuoted)
1445 if (UT_UCS4_isspace (ucs4))
1446 {
1447 bSpace = true;
1448 break;
1449 }
1450 while (static_cast<unsigned char>(*++csstr) & 0x80)
1451 ;
1452 if (!bSpace)
1453 value_end = csstr;
1454 continue;
1455 }
1456 else if ((*csstr == '\'') || (*csstr == '"'))
1457 {
1458 bQuoted = (bQuoted ? false : true);
1459 }
1460 else if (*csstr == ';')
1461 {
1462 if (!bQuoted)
1463 {
1464 csstr++;
1465 break;
1466 }
1467 }
1468 else if (!bQuoted && isspace (static_cast<int>(u)))
1469 bSpace = true;
1470
1471 csstr++;
1472 if (!bSpace)
1473 value_end = csstr;
1474 }
1475 return value_end;
1476 }
1477
1478
/*!
 * Advance csstr_ptr over a quoted string.
 *
 * The first character is taken as the opening quote when it is ' or ".
 * A backslash makes the next lead byte exempt from the quote test; bytes of
 * the form 10xxxxxx (UTF-8 trailing bytes) are never examined.
 *
 * \param csstr_ptr in: position of the opening quote; out: on success the
 *                  position just behind the closing quote, otherwise
 *                  unchanged
 * \return 0 when the string is empty; the position of the closing quote on
 *         success; the starting position when no closing quote is found
 */
static const char * s_pass_string (const char *& csstr_ptr)
{
	if (*csstr_ptr == 0)
		return 0;

	const char * const open = csstr_ptr;

	// 0 when the first character is not a quote; that can never match below
	// because the end of the string is tested first
	const char quote = ((*open == '\'') || (*open == '"')) ? *open : 0;

	bool escaped = false;

	for (const char * p = open + 1; ; ++p)
	{
		const unsigned char u = static_cast<unsigned char>(*p);

		if ((u & 0xc0) == 0x80)
			continue; // trailing byte

		if (u == 0)
			return open; // unterminated: report the start, consume nothing

		if (escaped)
		{
			escaped = false;
			continue;
		}

		if (*p == quote)
		{
			csstr_ptr = p + 1;
			return p;
		}

		if (*p == '\\')
			escaped = true;
	}
}
1529
s_pass_whitespace(const char * & csstr)1530 static void s_pass_whitespace (const char *& csstr)
1531 {
1532 while (*csstr)
1533 {
1534 unsigned char u = static_cast<unsigned char>(*csstr);
1535 if (u & 0x80)
1536 {
1537 UT_UTF8Stringbuf::UCS4Char ucs4 = UT_UTF8Stringbuf::charCode (csstr);
1538 if (UT_UCS4_isspace (ucs4))
1539 {
1540 while (static_cast<unsigned char>(*++csstr) & 0x80)
1541 ;
1542 continue;
1543 }
1544 }
1545 else if (isspace (static_cast<int>(u)))
1546 {
1547 csstr++;
1548 continue;
1549 }
1550 break;
1551 }
1552 }
1553
1554
UT_parse_attributes(const char * attributes,std::map<std::string,std::string> & map)1555 void UT_parse_attributes(const char * attributes,
1556 std::map<std::string, std::string> & map)
1557 {
1558 if ( attributes == 0)
1559 return;
1560 if (*attributes == 0)
1561 return;
1562
1563 const char * atstr = attributes;
1564
1565 std::string name;
1566 std::string value;
1567
1568 while (*atstr)
1569 {
1570 s_pass_whitespace (atstr);
1571
1572 const char * name_start = atstr;
1573 const char * name_end = s_pass_name (atstr, '=');
1574
1575 if (*atstr != '=')
1576 break; // whatever we have, it's not a name="value" pair
1577 if (name_start == name_end)
1578 break; // ?? stray equals?
1579
1580 name.clear();
1581 std::copy(name_start, name_end, name.begin());
1582
1583 atstr++;
1584
1585 if ((*atstr != '\'') && (*atstr != '"'))
1586 break; // whatever we have, it's not a name="value" pair
1587
1588 const char * value_start = atstr;
1589 const char * value_end = s_pass_string (atstr);
1590
1591 if (value_start == value_end)
1592 break; // ?? no value...
1593
1594 value_start++;
1595
1596 value.clear();
1597 std::copy(value_start, value_end, value.begin());
1598
1599 map[name] = value;
1600 }
1601 }
1602
1603
UT_parse_properties(const char * properties,std::map<std::string,std::string> & map)1604 void UT_parse_properties(const char * properties,
1605 std::map<std::string, std::string> & map)
1606 {
1607 if ( properties == 0)
1608 return;
1609 if (*properties == 0)
1610 return;
1611
1612 const char * csstr = properties;
1613
1614 std::string name;
1615 std::string value;
1616
1617 bool bSkip = false;
1618
1619 while (*csstr)
1620 {
1621 if (bSkip)
1622 {
1623 if (*csstr == ';')
1624 bSkip = false;
1625 ++csstr;
1626 continue;
1627 }
1628 s_pass_whitespace (csstr);
1629
1630 const char * name_start = csstr;
1631 const char * name_end = s_pass_name (csstr, ':');
1632
1633 if (*csstr == 0) break; // whatever we have, it's not a "name:value;" pair
1634 if (name_start == name_end) // ?? stray colon?
1635 {
1636 bSkip = true;
1637 continue;
1638 }
1639 name.resize(name_end - name_start);
1640 std::copy(name_start, name_end, name.begin());
1641
1642 s_pass_whitespace (csstr);
1643 if (*csstr != ':') // whatever we have, it's not a "name:value;" pair
1644 {
1645 bSkip = true;
1646 continue;
1647 }
1648
1649 csstr++;
1650 s_pass_whitespace (csstr);
1651
1652 if (*csstr == 0)
1653 break; // whatever we have, it's not a "name:value;" pair
1654
1655 const char * value_start = csstr;
1656 const char * value_end = s_pass_value (csstr);
1657
1658 if (value_start == value_end) // ?? no value...
1659 {
1660 bSkip = true;
1661 continue;
1662 }
1663 value.resize(value_end - value_start);
1664 std::copy(value_start, value_end, value.begin());
1665
1666 map[name] = value;
1667 }
1668 }
1669
/*!
 * Print a floating point value with at most one fractional digit, always
 * using a dot as the fractional separator, independent of the current
 * locale's settings.
 *
 * The value is truncated, not rounded (10.57 gives "10.5"). When the
 * fractional part is smaller than one tenth only the integer part is
 * printed. Negative values are printed as a minus sign followed by the
 * formatted magnitude; a value that would print as zero gets no sign.
 *
 * \param f value to format; its integer part has to fit into an int
 * \return pointer to a static buffer that is overwritten by the next call
 */
const char* std_size_string(float f)
{
	static char string[24];

	// Work on the magnitude so that the fractional digit is computed the
	// same way for negative and positive values.
	const bool  negative  = (f < 0);
	const float magnitude = negative ? -f : f;

	const int   whole     = static_cast<int>(magnitude);
	const float fraction  = magnitude - whole;

	if (fraction < 0.1)
	{
		// no fractional digit to show; avoid printing "-0"
		const char * sign = (negative && whole != 0) ? "-" : "";
		snprintf(string, sizeof(string), "%s%d", sign, whole);
	}
	else
	{
		const int tenths = static_cast<int>(10 * fraction);
		snprintf(string, sizeof(string), "%s%d.%d", negative ? "-" : "", whole, tenths);
	}
	return string;
}
1686
1687 #ifndef TOOLKIT_WIN
1688
UT_bidiGetCharType(UT_UCS4Char c)1689 UT_BidiCharType UT_bidiGetCharType(UT_UCS4Char c)
1690 {
1691 #ifndef NO_BIDI_SUPPORT
1692 return fribidi_get_type(c);
1693 #else
1694 return UT_BIDI_LTR;
1695 #endif
1696 }
1697
1698 /*!
1699 pStrOut needs to contain space for len characters + terminating 0
1700 */
UT_bidiReorderString(const UT_UCS4Char * pStrIn,UT_uint32 len,UT_BidiCharType baseDir,UT_UCS4Char * pStrOut)1701 bool UT_bidiReorderString(const UT_UCS4Char * pStrIn, UT_uint32 len, UT_BidiCharType baseDir,
1702 UT_UCS4Char * pStrOut)
1703 {
1704 UT_return_val_if_fail( pStrIn && pStrOut, false );
1705
1706 #ifndef NO_BIDI_SUPPORT
1707 // this works around 8685; this should be left here, in fact any decent optimising
1708 // compiler should remove this code if the bug does not exist
1709 if(sizeof(FriBidiChar) > sizeof(UT_UCS4Char))
1710 {
1711 static FriBidiChar* pFBDC = NULL;
1712 static FriBidiChar* pFBDC2 = NULL;
1713 static UT_uint32 iFBDlen = 0;
1714
1715 if(iFBDlen < len + 1)
1716 {
1717 delete [] pFBDC; delete [] pFBDC2;
1718 iFBDlen = 0;
1719
1720 pFBDC = new FriBidiChar [len + 1];
1721 pFBDC2 = new FriBidiChar [len + 1];
1722
1723 UT_return_val_if_fail( pFBDC && pFBDC2, false );
1724
1725 iFBDlen = len + 1;
1726 }
1727
1728 UT_uint32 i;
1729 for(i = 0; i < len; ++i)
1730 {
1731 pFBDC[i] = (FriBidiChar) pStrIn[i];
1732 }
1733
1734 pFBDC[i] = 0;
1735
1736 int iRet = fribidi_log2vis (pFBDC, len, &baseDir, pFBDC2, NULL, NULL, NULL);
1737
1738 for(i = 0; i < len; ++i)
1739 {
1740 pStrOut[i] = (UT_UCS4Char) pFBDC2[i];
1741 }
1742
1743 pStrOut[i] = 0;
1744
1745 return iRet;
1746 }
1747 else
1748 {
1749 return (0 != fribidi_log2vis ((FriBidiChar *)pStrIn, len, &baseDir, (FriBidiChar*)pStrOut, NULL, NULL, NULL));
1750 }
1751
1752 #else
1753 if(!pStrIn || !*pStrIn)
1754 return true;
1755
1756 UT_return_val_if_fail( pStrOut, false );
1757
1758 UT_UCS4_strncpy(pStrOut, pStrIn, len);
1759 return true;
1760 #endif
1761 }
1762
UT_bidiMapLog2Vis(const UT_UCS4Char * pStrIn,UT_uint32 len,UT_BidiCharType baseDir,UT_uint32 * pL2V,UT_uint32 * pV2L,UT_Byte * pEmbed)1763 bool UT_bidiMapLog2Vis(const UT_UCS4Char * pStrIn, UT_uint32 len, UT_BidiCharType baseDir,
1764 UT_uint32 *pL2V, UT_uint32 * pV2L, UT_Byte * pEmbed)
1765 {
1766 #ifndef NO_BIDI_SUPPORT
1767 // if this assert fails, we have a serious problem ...
1768 UT_ASSERT_HARMLESS( sizeof(UT_UCS4Char) == sizeof(FriBidiChar) );
1769 return (0 != fribidi_log2vis ((FriBidiChar *)pStrIn, len, &baseDir,
1770 NULL, (FriBidiStrIndex*)pL2V, (FriBidiStrIndex*)pV2L, (FriBidiLevel*)pEmbed));
1771 #else
1772 UT_return_val_if_fail( pL2V && pV2L && pEmbed, false );
1773 for(UT_uint32 i = 0; i < len; ++i)
1774 {
1775 pL2V[i] = i;
1776 pV2L[i] = i;
1777 pEmbed[i] = 0;
1778 }
1779
1780 return true;
1781 #endif
1782 }
1783
UT_bidiGetMirrorChar(UT_UCS4Char c,UT_UCS4Char & mc)1784 bool UT_bidiGetMirrorChar(UT_UCS4Char c, UT_UCS4Char &mc)
1785 {
1786 #ifndef NO_BIDI_SUPPORT
1787 return (0 != fribidi_get_mirror_char(c, (FriBidiChar*)&mc));
1788 #else
1789 return false;
1790 #endif
1791 }
1792
1793
1794 #endif
1795