1 /* AbiSource Program Utilities
2  * Copyright (C) 1998 AbiSource, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17  * 02110-1301 USA.
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <algorithm>
28 #include <math.h>
29 #include <ctype.h>
30 
31 #include "ut_types.h"
32 #include "ut_misc.h"
33 #include "ut_assert.h"
34 #include "ut_string.h"
35 #include "ut_debugmsg.h"
36 #include "ut_growbuf.h"
37 #include <fribidi.h>
38 #include "ut_mbtowc.h"
39 #include "ut_wctomb.h"
40 
41 #include "ut_string_class.h"
42 
43 #include "xap_EncodingManager.h"
44 
45 #define UT_STRING_CPP
46 #include "ut_case.h"
47 #undef  UT_STRING_CPP
48 
UT_XML_cloneNoAmpersands(gchar * & rszDest,const gchar * szSource)49 bool UT_XML_cloneNoAmpersands(gchar *& rszDest, const gchar * szSource)
50 {
51 	if (szSource == NULL)
52 		return false;
53 
54 	UT_uint32 length = strlen(szSource) + 1;
55 	rszDest = static_cast<gchar *>(UT_calloc(length, sizeof(gchar)));
56 
57 	if (!rszDest)
58 		return false;
59 
60 	const gchar * o = szSource;
61 	gchar * n = rszDest;
62 	while (*o != 0)
63 	{
64 		if (*o != '&')
65 		{
66 			*n = *o;
67 			n++;
68 		}
69 		o++;
70 	}
71 
72 	return true;
73 }
74 
UT_XML_cloneConvAmpersands(gchar * & rszDest,const gchar * szSource)75 bool UT_XML_cloneConvAmpersands(gchar *& rszDest, const gchar * szSource)
76 {
77 	if (szSource == NULL)
78 		return false;
79 
80 	UT_uint32 length = strlen(szSource) + 1;
81 	rszDest = static_cast<gchar *>(UT_calloc(length, sizeof(gchar)));
82 
83 	if (!rszDest)
84 		return false;
85 
86 	const gchar * o = szSource;
87 	gchar * n = rszDest;
88 	while (*o != 0)
89 	{
90 		if (*o != '&')
91 		{
92 			*n = *o;
93 		} else {
94 			if (o[1] == '&') {
95 				*n++ = '&';
96 			}
97 			else *n = '_';
98 		}
99 		n++; o++;
100 	}
101 
102 	return true;
103 }
104 
105 /* This uses the clone no ampersands but dumps into a static buffer */
UT_XML_transNoAmpersands(const gchar * szSource)106 const gchar *UT_XML_transNoAmpersands(const gchar * szSource)
107 {
108 	static gchar *rszDestBuffer = NULL;
109 	static UT_uint32 iDestBufferLength = 0;
110 
111 	if (szSource == NULL)
112 		return NULL;
113 
114 	UT_uint32 length = strlen(szSource) + 1;
115 	if (length > iDestBufferLength) {
116 		if (rszDestBuffer && iDestBufferLength) {
117 			g_free(rszDestBuffer);
118 		}
119 		iDestBufferLength = 0;
120 		rszDestBuffer = static_cast<gchar *>(UT_calloc(length, sizeof(gchar)));
121 
122 		if (!rszDestBuffer)
123 			return NULL;
124 
125 		iDestBufferLength = length;
126 	}
127 	memset(rszDestBuffer, 0, iDestBufferLength);
128 
129 	const gchar * o = szSource;
130 	gchar * n = rszDestBuffer;
131 	while (*o != 0)
132 	{
133 		if (*o != '&')
134 		{
135 			*n = *o;
136 			n++;
137 		}
138 		o++;
139 	}
140 
141 	return rszDestBuffer;
142 }
143 
144 /*! \fn bool UT_isValidXML(const char *s)
145 	 \param s The string of characters which is to be checked for XML-validity.
146 	 \retval TRUE if the characters are all valid for XML, FALSE if any one of them is not.
147 
148 	 NB: this function also checks that the string is valid utf-8
149 */
UT_isValidXML(const char * pString)150 bool UT_isValidXML(const char *pString)
151 {
152 	if(!pString)
153 		return true;
154 
155 	if(!g_utf8_validate(pString, -1, NULL))
156 		return false;
157 
158 	const UT_Byte * s = reinterpret_cast<const UT_Byte *>(pString);
159 
160 	while(*s)
161 	{
162 		if(*s < ' ' && *s != '\t' && *s != '\n' && *s != '\r')
163 		{
164 			return false;
165 		}
166 
167 		++s;
168 	}
169 
170 	return true;
171 }
172 
173 /*!
174     XML cannot contain any control characters except \t, \n, \r, see bug 8565
175     (http://www.w3.org/TR/REC-xml/#charsets)
176 
177     This function removes any illegal characters and invalid utf-8 sequences.
178 
179     The return value of true indicates that the string was modified
180 */
UT_validXML(char * pString)181 bool UT_validXML(char * pString)
182 {
183 	if(!pString)
184 		return false;
185 
186 	UT_ASSERT(sizeof(gchar) == sizeof(UT_Byte));
187 	const UT_Byte * p = reinterpret_cast<const UT_Byte *>(pString);	// gchar is signed...
188 
189 	bool bChanged = false;
190 	UT_uint32 len = strlen(pString);
191 
192 	int bytesInSequence = 0;
193 	int bytesExpectedInSequence = 0;
194 
195 	UT_String s;
196 	s.reserve(len);
197 
198 	for (UT_uint32 k=0; k<len; k++)
199 	{
200 		if (p[k] < 0x80)						// plain us-ascii part of latin-1
201 		{
202 			if(bytesInSequence != 0)
203 				bChanged = true;
204 
205 			// UT_Byte is unsigned char, hence p[k] always >= 0
206 			if(p[k] < ' ' /*&& p[k] >= 0*/ && p[k] != '\t' && p[k] != '\n' && p[k] != '\r')
207 			{
208 				bChanged = true;
209 			}
210 			else
211 				s += p[k];
212 
213 			bytesInSequence = 0;
214 			bytesExpectedInSequence = 0;
215 		}
216 		else if ((p[k] & 0xf0) == 0xf0)			// lead byte in 4-byte surrogate pair
217 		{
218 			if(bytesInSequence != 0)
219 				bChanged = true;
220 
221 			UT_ASSERT_HARMLESS( UT_NOT_IMPLEMENTED );
222 			bytesExpectedInSequence = 4;
223 			bytesInSequence = 1;
224 		}
225 		else if ((p[k] & 0xe0) == 0xe0)			// lead byte in 3-byte sequence
226 		{
227 			if(bytesInSequence != 0)
228 				bChanged = true;
229 
230 			bytesExpectedInSequence = 3;
231 			bytesInSequence = 1;
232 		}
233 		else if ((p[k] & 0xc0) == 0xc0)			// lead byte in 2-byte sequence
234 		{
235 			if(bytesInSequence != 0)
236 				bChanged = true;
237 
238 			bytesExpectedInSequence = 2;
239 			bytesInSequence = 1;
240 		}
241 		else if ((p[k] & 0x80) == 0x80)			// trailing byte in multi-byte sequence
242 		{
243 			bytesInSequence++;
244 			if (bytesInSequence == bytesExpectedInSequence)		// final byte in multi-byte sequence
245 			{
246 				for(UT_sint32 i = k - bytesInSequence + 1; i <= (UT_sint32)k; i++)
247 				{
248 					s += p[i];
249 				}
250 
251 				bytesInSequence = 0;
252 				bytesExpectedInSequence = 0;
253 			}
254 		}
255 	}
256 
257 	strncpy(pString, s.c_str(), s.length());
258 
259 	// make sure we null-terminate
260 	pString[s.length()] = 0;
261 	return bChanged;
262 }
263 
UT_decodeUTF8string(const gchar * pString,UT_uint32 len,UT_GrowBuf * pResult)264 void UT_decodeUTF8string(const gchar * pString, UT_uint32 len, UT_GrowBuf * pResult)
265 {
266 	// decode the given string [ p[0]...p[len] ] and append to the given growbuf.
267 
268 	UT_ASSERT(sizeof(gchar) == sizeof(UT_Byte));
269 	const UT_Byte * p = reinterpret_cast<const UT_Byte *>(pString);	// gchar is signed...
270 
271 	int bytesInSequence = 0;
272 	int bytesExpectedInSequence = 0;
273 	gchar buf[5];
274 
275 	for (UT_uint32 k=0; k<len; k++)
276 	{
277 		if (p[k] < 0x80)						// plain us-ascii part of latin-1
278 		{
279 			UT_ASSERT(bytesInSequence == 0);
280 			UT_UCSChar c = p[k];
281 			pResult->append(reinterpret_cast<UT_GrowBufElement *>(&c),1);
282 		}
283 		else if ((p[k] & 0xf0) == 0xf0)			// lead byte in 4-byte surrogate pair
284 		{
285 			// surrogate pairs are defined in section 3.7 of the
286 			// unicode standard version 2.0 as an extension
287 			// mechanism for rare characters in future extensions
288 			// of the unicode standard.
289 			UT_ASSERT(bytesInSequence == 0);
290 			bytesExpectedInSequence = 4;
291 			buf[bytesInSequence++] = p[k];
292 		}
293 		else if ((p[k] & 0xe0) == 0xe0)			// lead byte in 3-byte sequence
294 		{
295 			UT_ASSERT(bytesInSequence == 0);
296 			bytesExpectedInSequence = 3;
297 			buf[bytesInSequence++] = p[k];
298 		}
299 		else if ((p[k] & 0xc0) == 0xc0)			// lead byte in 2-byte sequence
300 		{
301 			UT_ASSERT(bytesInSequence == 0);
302 			bytesExpectedInSequence = 2;
303 			buf[bytesInSequence++] = p[k];
304 		}
305 		else if ((p[k] & 0x80) == 0x80)			// trailing byte in multi-byte sequence
306 		{
307 			UT_ASSERT(bytesInSequence > 0);
308 			buf[bytesInSequence++] = p[k];
309 			if (bytesInSequence == bytesExpectedInSequence)		// final byte in multi-byte sequence
310 			{
311 				UT_UCSChar c = g_utf8_get_char(buf);
312 				pResult->append(reinterpret_cast<UT_GrowBufElement *>(&c),1);
313 				bytesInSequence = 0;
314 				bytesExpectedInSequence = 0;
315 			}
316 		}
317 	}
318 }
319 
320 /*
321   The following code is from the GNU C library, version 2.0.6.
322   It has been reformatted and tweaked to do Unicode strstrs.
323   All this licensing stuff is kinda ugly, but I didn't want
324   to risk merging the licensing for fear I might break some law.
325 */
326 
327 /* Copyright (C) 1994, 1996 Free Software Foundation, Inc.
328    This file is part of the GNU C Library.
329 
330    The GNU C Library is free software; you can redistribute it and/or
331    modify it under the terms of the GNU Library General Public License as
332    published by the Free Software Foundation; either version 2 of the
333    License, or (at your option) any later version.
334 
335    The GNU C Library is distributed in the hope that it will be useful,
336    but WITHOUT ANY WARRANTY; without even the implied warranty of
337    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
338    Library General Public License for more details.
339 
340    You should have received a copy of the GNU Library General Public
341    License along with the GNU C Library; see the file COPYING.LIB.  If not,
342    write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
343    Boston, MA 02110-1301 USA.  */
344 
345 ////////////////////////////////////////////////////////////////////////
346 //
347 //  UCS-2 string (UT_UCS2Char)
348 //
349 //  String is built of 16-bit units (words)
350 //
351 //  TODO: Is this really UCS-2 or UTF-16?
352 //  TODO:  meaning, does it support surrogates or is it intended to
353 //  TODO:  support them at any time in the future?
354 //  TODO: Correctly, UCS-2 does not support surrogates and UTF-16 does.
355 //  TODO: BUT Microsoft calls their native Unicode encoding UCS-2
356 //  TODO:  while it supports surrogates and is thus really UTF-16.
357 //  TODO: Surrogates are Unicode characters with codepoints above
358 //  TODO:  65535 which cannot therefore fit into a 2-byte word.
359 //  TODO: This means that TRUE UCS-2 is a single-word encoding and
360 //  TODO:  UTF-16 is a multi-word encoding.
361 //
362 //  NOTE: We shouldn't actually need 16-bit strings anymore since
363 //  NOTE:  AbiWord is now fully converted to using 32-bit Unicode
364 //  NOTE:  internally. The only possible needs for this is for
365 //  NOTE:  Windows GUI, filesystem and API functions where applicable;
366 //  NOTE:  and perhaps some file formats or external libraries
367 //
368 ////////////////////////////////////////////////////////////////////////
369 
370 // Don't ifdef out strlen since it's used by the MSWord importer...
371 
372 // TODO is this really UCS-2 or UTF-16?
373 // TODO and are we using strlen for the number of 16-bit words
374 // TODO or the number of characters?
375 // TODO Because UTF-16 characters are sometimes expressed as 2 words
376 
UT_UCS2_strlen(const UT_UCS2Char * string)377 UT_uint32 UT_UCS2_strlen(const UT_UCS2Char * string)
378 {
379 	UT_uint32 i;
380 
381 	for(i = 0; *string != 0; string++, i++)
382 		;
383 
384 	return i;
385 }
386 
387 #ifdef ENABLE_UCS2_STRINGS
388 /*
389  * My personal strstr() implementation that beats most other algorithms.
390  * Until someone tells me otherwise, I assume that this is the
391  * fastest implementation of strstr() in C.
392  * I deliberately chose not to comment it.  You should have at least
393  * as much fun trying to understand it, as I had to write it :-).
394  *
395  * Stephen R. van den Berg, berg@pool.informatik.rwth-aachen.de */
396 
UT_UCS2_strstr(const UT_UCS2Char * phaystack,const UT_UCS2Char * pneedle)397 UT_UCS2Char * UT_UCS2_strstr(const UT_UCS2Char * phaystack, const UT_UCS2Char * pneedle)
398 {
399 	register const UT_UCS2Char *haystack, *needle;
400 	register UT_UCS2Char b, c;
401 
402 	haystack = phaystack;
403 	needle = pneedle;
404 
405 	b = *needle;
406 	if (b != '\0')
407     {
408 		haystack--;                               /* possible ANSI violation */
409 		do
410         {
411 			c = *++haystack;
412 			if (c == '\0')
413 				goto ret0;
414         }
415 		while (c != b);
416 
417 		c = *++needle;
418 		if (c == '\0')
419 			goto foundneedle;
420 		++needle;
421 		goto jin;
422 
423 		for (;;)
424         {
425 			register UT_UCS2Char a;
426 			register const UT_UCS2Char *rhaystack, *rneedle;
427 
428 			do
429             {
430 				a = *++haystack;
431 				if (a == '\0')
432 					goto ret0;
433 				if (a == b)
434 					break;
435 				a = *++haystack;
436 				if (a == '\0')
437 					goto ret0;
438 			shloop: ; // need a statement here for EGCS 1.1.1 to accept it
439 			}
440 			while (a != b);
441 
442 		jin:	a = *++haystack;
443 			if (a == '\0')
444 				goto ret0;
445 
446 			if (a != c)
447 				goto shloop;
448 
449 			rhaystack = haystack-- + 1;
450 			rneedle = needle;
451 			a = *rneedle;
452 
453 			if (*rhaystack == a)
454 				do
455 				{
456 					if (a == '\0')
457 						goto foundneedle;
458 					++rhaystack;
459 					a = *++needle;
460 					if (*rhaystack != a)
461 						break;
462 					if (a == '\0')
463 						goto foundneedle;
464 					++rhaystack;
465 					a = *++needle;
466 				}
467 				while (*rhaystack == a);
468 
469 			needle = rneedle;             /* took the register-poor approach */
470 
471 			if (a == '\0')
472 				break;
473         }
474     }
475  foundneedle:
476 	return static_cast<UT_UCS2Char *>(haystack);
477  ret0:
478 	return 0;
479 }
480 
UT_UCS2_strcmp(const UT_UCS2Char * left,const UT_UCS2Char * right)481 UT_sint32 UT_UCS2_strcmp(const UT_UCS2Char* left, const UT_UCS2Char* right)
482 {
483 	UT_ASSERT(left);
484 	UT_ASSERT(right);
485 
486 	while (*left && *right)
487 	{
488 		if (*left < *right)
489 		{
490 			return -1;
491 		}
492 
493 		if (*left > *right)
494 		{
495 			return 1;
496 		}
497 
498 		left++;
499 		right++;
500 	}
501 
502 	if (*left)
503 	{
504 		return -1;
505 	}
506 	else if (*right)
507 	{
508 		return 1;
509 	}
510 	else
511 	{
512 		return 0;
513 	}
514 }
515 
516 /*
517   Latin-1 Unicode case-insensitive string comparison and casing done by
518   Pierre Sarrazin <ps@cam.org>.
519 */
520 
521 /**
522  * Convert a given character to uppercase
523  */
UT_UCS2_toupper(UT_UCS2Char c)524 UT_UCS2Char UT_UCS2_toupper(UT_UCS2Char c)
525 {
526         if (c < 128) // in ASCII range
527 	  return toupper(c);
528 
529 	if (XAP_EncodingManager::get_instance()->single_case())
530 		return c;
531 	/*let's trust libc! -- does not seem to work :(*/
532     case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
533     if(!letter || letter->type == 1)
534         return c;
535     return letter->other;
536 }
537 
538 
539 /*	Converts the given character to lowercase if it is an uppercase letter.
540 	Returns it unchanged if it is not.
541 	This function created by Pierre Sarrazin 1999-02-06
542 */
543 
UT_UCS2_tolower(UT_UCS2Char c)544 UT_UCS2Char UT_UCS2_tolower(UT_UCS2Char c)
545 {
546 	if (c < 128)
547 		return tolower(c);
548 	if (XAP_EncodingManager::get_instance()->single_case())
549 		return c;
550 	/*let's trust libc!*/
551     case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
552     if(!letter || letter->type == 0)
553         return c;
554     return letter->other;
555 }
556 
557 
558 /*	Characters are converted to lowercase (if applicable) when they
559 	are read from the needle or the haystack. See UT_UCS_tolower().
560 	This function created by Pierre Sarrazin 1999-02-06
561 */
562 
UT_UCS2_stristr(const UT_UCS2Char * phaystack,const UT_UCS2Char * pneedle)563 UT_UCS2Char * UT_UCS2_stristr(const UT_UCS2Char * phaystack, const UT_UCS2Char * pneedle)
564 {
565 	register const UT_UCS2Char *haystack, *needle;
566 	register UT_UCS2Char b, c;
567 
568 	haystack = phaystack;
569 	needle = pneedle;
570 
571 	b = UT_UCS2_tolower(*needle);
572 	if (b != '\0')
573     {
574 		haystack--;                               /* possible ANSI violation */
575 		do
576         {
577 			c = UT_UCS2_tolower(*++haystack);
578 			if (c == '\0')
579 				goto ret0;
580         }
581 		while (c != b);
582 
583 		c = UT_UCS2_tolower(*++needle);
584 		if (c == '\0')
585 			goto foundneedle;
586 		++needle;
587 		goto jin;
588 
589 		for (;;)
590         {
591 			register UT_UCS2Char a;
592 			register const UT_UCS2Char *rhaystack, *rneedle;
593 
594 			do
595             {
596 				a = UT_UCS2_tolower(*++haystack);
597 				if (a == '\0')
598 					goto ret0;
599 				if (a == b)
600 					break;
601 				a = UT_UCS2_tolower(*++haystack);
602 				if (a == '\0')
603 					goto ret0;
604 			shloop: ; // need a statement here for EGCS 1.1.1 to accept it
605 			}
606 			while (a != b);
607 
608 		jin:	a = UT_UCS2_tolower(*++haystack);
609 			if (a == '\0')
610 				goto ret0;
611 
612 			if (a != c)
613 				goto shloop;
614 
615 			rhaystack = haystack-- + 1;
616 			rneedle = needle;
617 			a = UT_UCS2_tolower(*rneedle);
618 
619 			if (UT_UCS2_tolower(*rhaystack) == a)
620 				do
621 				{
622 					if (a == '\0')
623 						goto foundneedle;
624 					++rhaystack;
625 					a = UT_UCS2_tolower(*++needle);
626 					if (UT_UCS2_tolower(*rhaystack) != a)
627 						break;
628 					if (a == '\0')
629 						goto foundneedle;
630 					++rhaystack;
631 					a = UT_UCS2_tolower(*++needle);
632 				}
633 				while (UT_UCS2_tolower(*rhaystack) == a);
634 
635 			needle = rneedle;             /* took the register-poor approach */
636 
637 			if (a == '\0')
638 				break;
639         }
640     }
641  foundneedle:
642 	return static_cast<UT_UCS2Char *>(haystack);
643  ret0:
644 	return 0;
645 }
646 /****************************************************************************/
647 
UT_UCS2_strcpy(UT_UCS2Char * dest,const UT_UCS2Char * src)648 UT_UCS2Char * UT_UCS2_strcpy(UT_UCS2Char * dest, const UT_UCS2Char * src)
649 {
650 	UT_ASSERT(dest);
651 	UT_ASSERT(src);
652 
653 	UT_UCS2Char * d = dest;
654 	UT_UCS2Char * s = static_cast<UT_UCS2Char *>(src);
655 
656 	while (*s != 0)
657 		*d++ = *s++;
658 	*d = 0;
659 
660 	return dest;
661 }
662 
UT_UCS2_strcpy_char(UT_UCS2Char * dest,const char * src)663 UT_UCS2Char * UT_UCS2_strcpy_char(UT_UCS2Char * dest, const char * src)
664 {
665 	UT_ASSERT(dest);
666 	UT_ASSERT(src);
667 
668 	UT_UCS2Char * d 		= dest;
669 	unsigned char * s	= static_cast<unsigned char *>(src);
670 
671 	static UT_UCS2_mbtowc m(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
672 	UT_UCS2Char wc;
673 
674 	while (*s != 0)
675 	  {
676 		if(m.mbtowc(wc,*s))*d++=wc;
677 		s++;
678 	  }
679 	*d = 0;
680 
681 	return dest;
682 }
683 
UT_UCS2_strcpy_to_char(char * dest,const UT_UCS2Char * src)684 char * UT_UCS2_strcpy_to_char(char * dest, const UT_UCS2Char * src)
685 {
686 	UT_ASSERT(dest);
687 	UT_ASSERT(src);
688 
689 	UT_ASSERT_NOT_REACHED();
690 
691 	return NULL;
692 }
693 
UT_UCS2_cloneString(UT_UCS2Char ** dest,const UT_UCS2Char * src)694 bool UT_UCS2_cloneString(UT_UCS2Char ** dest, const UT_UCS2Char * src)
695 {
696 	UT_uint32 length = UT_UCS2_strlen(src) + 1;
697 	*dest = static_cast<UT_UCS2Char *>(UT_calloc(length,sizeof(UT_UCS2Char)));
698 	if (!*dest)
699 		return false;
700 	memmove(*dest,src,length*sizeof(UT_UCS2Char));
701 
702 	return true;
703 }
704 
UT_UCS2_cloneString_char(UT_UCS2Char ** dest,const char * src)705 bool UT_UCS2_cloneString_char(UT_UCS2Char ** dest, const char * src)
706 {
707   UT_ASSERT_NOT_REACHED();
708   return false;
709 }
710 
UT_UCS2_isupper(UT_UCS2Char c)711 bool UT_UCS2_isupper(UT_UCS2Char c)
712 {
713 	if(c < 127)
714 		return isupper(c)!=0;
715 
716     case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
717     if(letter && letter->type == 1)
718         return true;
719     return false;
720 };
721 
UT_UCS2_islower(UT_UCS2Char c)722 bool UT_UCS2_islower(UT_UCS2Char c)
723 {
724 	if(c < 127)
725 		return islower(c)!=0;
726 
727     case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
728     if(!letter || letter->type == 0)
729         return true;
730     return false;
731 };
732 
UT_UCS2_isspace(UT_UCS2Char c)733 bool UT_UCS2_isspace(UT_UCS2Char c)
734 {
735 	// the whitespace table is small, so use linear search
736 	for (UT_uint32 i = 0; i < G_N_ELEMENTS(whitespace_table); i++)
737 	{
738 		if(whitespace_table[i].high < c)
739 			continue;
740 		if(whitespace_table[i].low <= c)
741 			return true;
742 		// if we got here, then low > c
743 		return false;
744 	}
745 	return false;
746 };
747 
UT_UCS2_isalpha(UT_UCS2Char c)748 bool UT_UCS2_isalpha(UT_UCS2Char c)
749 {
750     UT_BidiCharType type = UT_bidiGetCharType(c);
751     return (UT_BIDI_IS_LETTER(type) != 0);
752 };
753 
UT_UCS2_isSentenceSeparator(UT_UCS2Char c)754 bool UT_UCS2_isSentenceSeparator(UT_UCS2Char c)
755 {
756 	switch(c)
757 	{
758 	case '?': // fall-through
759 	case '!': // fall-through
760 	case '.':
761 		return true;
762 
763 	default:
764 		return false;
765 	}
766 }
767 
768 /* copies exactly n-chars from src to dest; NB! does not check for 00 i src
769 */
UT_UCS2_strncpy(UT_UCS2Char * dest,const UT_UCS2Char * src,UT_uint32 n)770 UT_UCS2Char * UT_UCS2_strncpy(UT_UCS2Char * dest, const UT_UCS2Char * src, UT_uint32 n)
771 {
772 	UT_ASSERT(dest);
773 	UT_ASSERT(src);
774 
775 	UT_UCS2Char * d = dest;
776 	UT_UCS2Char * s = static_cast<UT_UCS2Char *>(src);
777 
778 	for (; d < static_cast<UT_UCS2Char *>(dest) + n;)
779 		*d++ = *s++;
780 	*d = '\0';
781 
782 	return dest;
783 }
784 
785 
786 /* reverses str of len n; used by BiDi which always knows the len of string to process
787    thus we can save ourselves searching for the 00 */
UT_UCS2_strnrev(UT_UCS2Char * src,UT_uint32 n)788 UT_UCS2Char * UT_UCS2_strnrev(UT_UCS2Char * src, UT_uint32 n)
789 {
790     UT_UCS2Char t;
791     UT_uint32 i;
792 
793     for(i = 0; i < n/2; i++)
794     {
795         t = *(src + i);
796         *(src + i) = *(src + n - i - 1); //-1 so that we do not move the 00
797         *(src + n - i - 1) = t;
798     }
799     return src;
800 }
801 
802 #endif
803 
804 
805 ////////////////////////////////////////////////////////////////////////
806 //
807 //  UCS string (UT_UCSChar)
808 //
809 //  String is built of units based on UT_UCSChar, which used to be
810 //   UT_UCS2Char and is now UT_UCS4Char
811 //
812 ////////////////////////////////////////////////////////////////////////
813 
UT_isSmartQuotableCharacter(UT_UCSChar c)814 bool UT_isSmartQuotableCharacter(UT_UCSChar c)
815 {
816 	// TODO:  this is anglo-centric; really need a locale argument or
817 	// TODO:  something to get smart quote rules for the rest of the world
818 	bool result;
819 	switch (c)
820 	{
821 	case '"':
822 	case '`':
823 	case '\'':
824 		result = true;
825 		break;
826 	default:
827 		result = false;
828 		break;
829 	}
830 	return (result);
831 }
832 
UT_isSmartQuotedCharacter(UT_UCSChar c)833 bool UT_isSmartQuotedCharacter(UT_UCSChar c)
834 {
835 	bool result;
836 	switch (c)
837 	{
838 	case UCS_LQUOTE:
839 	case UCS_RQUOTE:
840 	case UCS_LDBLQUOTE:
841 	case UCS_RDBLQUOTE:
842 	case 0x201a:
843 	case 0x201e:
844 	case 0x2039:
845 	case 0x203a:
846 	case 0x300c:
847 	case 0x300d:
848 	case 0x300e:
849 	case 0x300f:
850 	case '\"':
851 	case '\'':
852 		result = true;
853 		break;
854 	default:
855 		result = false;
856 		break;
857 	}
858 	return (result);
859 }
860 
861 ////////////////////////////////////////////////////////////////////////
862 //
863 //  UCS-4 string
864 //
865 //  String is built of 32-bit units (longs)
866 //
867 //  NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference
868 //  NOTE:  in the case of UCS-4 and UTF-32 since they really are
869 //  NOTE:  identical
870 //
871 ////////////////////////////////////////////////////////////////////////
872 
UT_UCS4_isupper(UT_UCS4Char c)873 bool UT_UCS4_isupper(UT_UCS4Char c)
874 {
875 	if(c < 127)
876 		return isupper(c)!=0;
877 
878     case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
879     if(letter && letter->type == 1)
880         return true;
881     return false;
882 }
883 
UT_UCS4_islower(UT_UCS4Char c)884 bool UT_UCS4_islower(UT_UCS4Char c)
885 {
886 	if(c < 127)
887 		return islower(c)!=0;
888 
889     case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
890     if(!letter || letter->type == 0)
891         return true;
892     return false;
893 }
894 
UT_UCS4_isspace(UT_UCS4Char c)895 bool UT_UCS4_isspace(UT_UCS4Char c)
896 {
897 	// the whitespace table is small, so use linear search
898 	for (UT_uint32 i = 0; i < G_N_ELEMENTS(whitespace_table); i++)
899 	{
900 		if(whitespace_table[i].high < c)
901 			continue;
902 		if(whitespace_table[i].low <= c)
903 			return true;
904 		// if we got here, then low > c
905 		return false;
906 	}
907 	return false;
908 }
909 
UT_UCS4_isalpha(UT_UCS4Char c)910 bool UT_UCS4_isalpha(UT_UCS4Char c)
911 {
912     UT_BidiCharType type = UT_bidiGetCharType(c);
913     return (UT_BIDI_IS_LETTER(type) != 0);
914 }
915 
UT_UCS4_isSentenceSeparator(UT_UCS4Char c)916 bool UT_UCS4_isSentenceSeparator(UT_UCS4Char c)
917 {
918 	switch(c)
919 	{
920 	case '?': // fall-through
921 	case '!': // fall-through
922 	case '.':
923 		return true;
924 
925 	default:
926 		return false;
927 	}
928 }
929 
UT_UCS4_isdigit(UT_UCS4Char c)930 bool UT_UCS4_isdigit(UT_UCS4Char c)
931 {
932 	if (c < 0x700) {
933 		for (unsigned int i=0; i < G_N_ELEMENTS(digits_table); i++) {
934 			if (c < digits_table[i].low) break;
935 			if (c <= digits_table[i].high)
936 				return true;
937 		}
938 	} else {
939 		ucs_range * rng = static_cast<ucs_range *>(bsearch(&c, &digits_table,
940 			G_N_ELEMENTS(digits_table),sizeof(ucs_range),s_cmp_digits));
941 		if (rng) return true;
942 	}
943 	return false;
944 }
945 
946 /* copies exactly n-chars from src to dest; NB! does not check for 00 i src
947 */
UT_UCS4_strncpy(UT_UCS4Char * dest,const UT_UCS4Char * src,UT_uint32 n)948 UT_UCS4Char * UT_UCS4_strncpy(UT_UCS4Char * dest, const UT_UCS4Char * src, UT_uint32 n)
949 {
950 	UT_ASSERT(dest);
951 	UT_ASSERT(src);
952 
953 	UT_UCSChar * d = dest;
954 	const UT_UCSChar * s = static_cast<const UT_UCS4Char *>(src);
955 
956 	for (; d < static_cast<UT_UCS4Char *>(dest) + n;)
957 		*d++ = *s++;
958 	*d = '\0';
959 
960 	return dest;
961 }
962 
963 
964 /* reverses str of len n; used by BiDi which always knows the len of string to process
965    thus we can save ourselves searching for the 00 */
UT_UCS4_strnrev(UT_UCS4Char * src,UT_uint32 n)966 UT_UCS4Char * UT_UCS4_strnrev(UT_UCS4Char * src, UT_uint32 n)
967 {
968     UT_UCS4Char t;
969     UT_uint32 i;
970 
971     for(i = 0; i < n/2; i++)
972     {
973         t = *(src + i);
974         *(src + i) = *(src + n - i - 1); //-1 so that we do not move the 00
975         *(src + n - i - 1) = t;
976     }
977     return src;
978 }
979 
980 
UT_UCS4_strstr(const UT_UCS4Char * phaystack,const UT_UCS4Char * pneedle)981 UT_UCS4Char * UT_UCS4_strstr(const UT_UCS4Char * phaystack, const UT_UCS4Char * pneedle)
982 {
983 	register const UT_UCS4Char *haystack, *needle;
984 	register UT_UCS4Char b, c;
985 
986 	haystack = static_cast<const UT_UCS4Char *>(phaystack);
987 	needle = static_cast<const UT_UCS4Char *>(pneedle);
988 
989 	b = *needle;
990 	if (b != '\0')
991     {
992 		haystack--;                               /* possible ANSI violation */
993 		do
994         {
995 			c = *++haystack;
996 			if (c == '\0')
997 				goto ret0;
998         }
999 		while (c != b);
1000 
1001 		c = *++needle;
1002 		if (c == '\0')
1003 			goto foundneedle;
1004 		++needle;
1005 		goto jin;
1006 
1007 		for (;;)
1008         {
1009 			register UT_UCS4Char a;
1010 			register const UT_UCS4Char *rhaystack, *rneedle;
1011 
1012 			do
1013             {
1014 				a = *++haystack;
1015 				if (a == '\0')
1016 					goto ret0;
1017 				if (a == b)
1018 					break;
1019 				a = *++haystack;
1020 				if (a == '\0')
1021 					goto ret0;
1022 			shloop: ; // need a statement here for EGCS 1.1.1 to accept it
1023 			}
1024 			while (a != b);
1025 
1026 		jin:	a = *++haystack;
1027 			if (a == '\0')
1028 				goto ret0;
1029 
1030 			if (a != c)
1031 				goto shloop;
1032 
1033 			rhaystack = haystack-- + 1;
1034 			rneedle = needle;
1035 			a = *rneedle;
1036 
1037 			if (*rhaystack == a)
1038 				do
1039 				{
1040 					if (a == '\0')
1041 						goto foundneedle;
1042 					++rhaystack;
1043 					a = *++needle;
1044 					if (*rhaystack != a)
1045 						break;
1046 					if (a == '\0')
1047 						goto foundneedle;
1048 					++rhaystack;
1049 					a = *++needle;
1050 				}
1051 				while (*rhaystack == a);
1052 
1053 			needle = rneedle;             /* took the register-poor approach */
1054 
1055 			if (a == '\0')
1056 				break;
1057         }
1058     }
1059  foundneedle:
1060 	return const_cast<UT_UCS4Char *>(haystack);
1061  ret0:
1062 	return 0;
1063 }
1064 
UT_UCS4_strcmp(const UT_UCS4Char * left,const UT_UCS4Char * right)1065 UT_sint32 UT_UCS4_strcmp(const UT_UCS4Char* left, const UT_UCS4Char* right)
1066 {
1067 	UT_ASSERT(left);
1068 	UT_ASSERT(right);
1069 
1070 	while (*left && *right)
1071 	{
1072 		if (*left < *right)
1073 		{
1074 			return -1;
1075 		}
1076 
1077 		if (*left > *right)
1078 		{
1079 			return 1;
1080 		}
1081 
1082 		left++;
1083 		right++;
1084 	}
1085 
1086 	if (*left)
1087 	{
1088 		return -1;
1089 	}
1090 	else if (*right)
1091 	{
1092 		return 1;
1093 	}
1094 	else
1095 	{
1096 		return 0;
1097 	}
1098 }
1099 
1100 /*
1101   Latin-1 Unicode case-insensitive string comparison and casing done by
1102   Pierre Sarrazin <ps@cam.org>.
1103 */
1104 
1105 /**
1106  * Convert a given character to uppercase
1107  */
UT_UCS4_toupper(UT_UCS4Char c)1108 UT_UCS4Char UT_UCS4_toupper(UT_UCS4Char c)
1109 {
1110         if (c < 128) // in ASCII range
1111 	  return toupper(c);
1112 
1113 	if (XAP_EncodingManager::get_instance()->single_case())
1114 		return c;
1115 	/*let's trust libc! -- does not seem to work :(*/
1116     case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
1117     if(!letter || letter->type == 1)
1118         return c;
1119     return letter->other;
1120 }
1121 
1122 
1123 /*	Converts the given character to lowercase if it is an uppercase letter.
1124 	Returns it unchanged if it is not.
1125 	This function created by Pierre Sarrazin 1999-02-06
1126 */
1127 
UT_UCS4_tolower(UT_UCS4Char c)1128 UT_UCS4Char UT_UCS4_tolower(UT_UCS4Char c)
1129 {
1130 	if (c < 128)
1131 		return tolower(c);
1132 
1133 	if (XAP_EncodingManager::get_instance()->single_case())
1134 		return c;
1135 	/*let's trust libc!*/
1136     case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, G_N_ELEMENTS(case_table),sizeof(case_entry),s_cmp_case));
1137     if(!letter || letter->type == 0)
1138         return c;
1139     return letter->other;
1140 }
1141 
1142 
/*	Case-insensitive search for pneedle inside phaystack.
	Characters are converted to lowercase (if applicable) when they
	are read from the needle or the haystack. See UT_UCS4_tolower().
	Returns a pointer into the haystack at the start of the match, the
	haystack itself when the needle is empty, or 0 when the end of the
	haystack is reached without a match. Neither argument is checked
	for NULL.
	The control flow (labels, unrolled inner loops) looks adapted from a
	classic C library strstr() -- NOTE(review): confirm provenance before
	restructuring; statement order matters here.
	This function created by Pierre Sarrazin 1999-02-06
*/

UT_UCS4Char * UT_UCS4_stristr(const UT_UCS4Char * phaystack, const UT_UCS4Char * pneedle)
{
	register const UT_UCS4Char *haystack, *needle;
	register UT_UCS4Char b, c;

	haystack = static_cast<const UT_UCS4Char *>(phaystack);
	needle = static_cast<const UT_UCS4Char *>(pneedle);

	// b is the lowercased first character of the needle; when it is the
	// terminator the whole block is skipped and the haystack is returned.
	b = UT_UCS4_tolower(*needle);
	if (b != '\0')
    {
		// Step back one so the pre-increment below starts at the first character.
		haystack--;                               /* possible ANSI violation */
		do
        {
			c = UT_UCS4_tolower(*++haystack);
			if (c == '\0')
				goto ret0;
        }
		while (c != b);

		// c now becomes the lowercased second character of the needle; a
		// one-character needle is already matched at this point.
		c = UT_UCS4_tolower(*++needle);
		if (c == '\0')
			goto foundneedle;
		++needle;
		// Enter the main loop in the middle: the first character has
		// already been located, so go straight to checking the second.
		goto jin;

		for (;;)
        {
			register UT_UCS4Char a;
			register const UT_UCS4Char *rhaystack, *rneedle;

			// Scan forward, two haystack characters per iteration, for the
			// next occurrence of b.
			do
            {
				a = UT_UCS4_tolower(*++haystack);
				if (a == '\0')
					goto ret0;
				if (a == b)
					break;
				a = UT_UCS4_tolower(*++haystack);
				if (a == '\0')
					goto ret0;
			shloop: ; // need a statement here for EGCS 1.1.1 to accept it
			}
			while (a != b);

			// Character following a b in the haystack: it must equal c.
		jin:	a = UT_UCS4_tolower(*++haystack);
			if (a == '\0')
				goto ret0;

			if (a != c)
				goto shloop;

			// First two characters match. Move haystack back onto the
			// candidate start and compare the rest through rhaystack.
			rhaystack = haystack-- + 1;
			rneedle = needle;
			a = UT_UCS4_tolower(*rneedle);

			if (UT_UCS4_tolower(*rhaystack) == a)
				do
				{
					// Reaching the needle's terminator means a full match.
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = UT_UCS4_tolower(*++needle);
					if (UT_UCS4_tolower(*rhaystack) != a)
						break;
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = UT_UCS4_tolower(*++needle);
				}
				while (UT_UCS4_tolower(*rhaystack) == a);

			// Restore the needle position saved before the comparison.
			needle = rneedle;             /* took the register-poor approach */

			if (a == '\0')
				break;
        }
    }
 foundneedle:
	return const_cast<UT_UCS4Char *>(haystack);
 ret0:
	return 0;
}
1231 /****************************************************************************/
1232 
UT_UCS4_strlen(const UT_UCS4Char * string)1233 UT_uint32 UT_UCS4_strlen(const UT_UCS4Char * string)
1234 {
1235 	UT_uint32 i;
1236 
1237 	for(i = 0; *string != 0; string++, i++)
1238 		;
1239 
1240 	return i;
1241 }
1242 
UT_UCS4_strlen_as_char(const UT_UCS4Char * string)1243 UT_uint32 UT_UCS4_strlen_as_char(const UT_UCS4Char * string)
1244 {
1245 	UT_uint32 i = 0;
1246 
1247 	char d[4]; // assuming that any character can be coded with no more that 4 bytes.
1248 
1249 	UT_Wctomb w(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1250 
1251 	while (*string != 0)
1252 	  {
1253 		int length;
1254 		w.wctomb_or_fallback(d,length,*string++);
1255 		i+=length;
1256 	  }
1257 
1258 	return i;
1259 }
1260 
UT_UCS4_strcpy(UT_UCS4Char * dest,const UT_UCS4Char * src)1261 UT_UCS4Char * UT_UCS4_strcpy(UT_UCS4Char * dest, const UT_UCS4Char * src)
1262 {
1263 	UT_ASSERT(dest);
1264 	UT_ASSERT(src);
1265 
1266 	UT_UCS4Char * d = dest;
1267 	const UT_UCS4Char * s = static_cast<const UT_UCS4Char *>(src);
1268 
1269 	while (*s != 0)
1270 		*d++ = *s++;
1271 	*d = 0;
1272 
1273 	return dest;
1274 }
1275 
1276 // TODO shouldn't all of the 'char *' strings be 'unsigned char *' strings ??
1277 
UT_UCS4_strcpy_char(UT_UCS4Char * dest,const char * src)1278 UT_UCS4Char * UT_UCS4_strcpy_char(UT_UCS4Char * dest, const char * src)
1279 {
1280 	UT_ASSERT(dest);
1281 	UT_ASSERT(src);
1282 
1283 	UT_UCS4Char * d 		= dest;
1284 	const char * s	= static_cast<const char *>(src);
1285 
1286 	static UT_UCS4_mbtowc m(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1287 	UT_UCS4Char wc;
1288 
1289 	while (*s != 0)
1290 	  {
1291 		if(m.mbtowc(wc,*s))*d++=wc;
1292 		s++;
1293 	  }
1294 	*d = 0;
1295 
1296 	return dest;
1297 }
1298 
UT_UCS4_strncpy_char(UT_UCS4Char * dest,const char * src,int n)1299 UT_UCS4Char * UT_UCS4_strncpy_char(UT_UCS4Char * dest, const char * src, int n)
1300 {
1301 	UT_ASSERT(dest);
1302 	UT_ASSERT(src);
1303 
1304 	UT_UCS4Char * d 		= dest;
1305 	const char * s	= static_cast<const char *>(src);
1306 
1307 	static UT_UCS4_mbtowc m(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1308 	UT_UCS4Char wc;
1309 
1310 	while (*s != 0 && n > 0)
1311 	  {
1312 		if(m.mbtowc(wc,*s))*d++=wc;
1313 		s++;
1314 		n--;
1315 	  }
1316 	*d = 0;
1317 
1318 	return dest;
1319 }
1320 
UT_UCS4_strcpy_utf8_char(UT_UCS4Char * dest,const char * src)1321 UT_UCS4Char * UT_UCS4_strcpy_utf8_char(UT_UCS4Char * dest, const char * src)
1322 {
1323 	// FIXME: This could be more efficient than it is, on the other
1324 	// hand, it should be correct
1325 
1326 	UT_ASSERT(dest);
1327 	UT_ASSERT(src);
1328 
1329 	UT_UCS4String ucs4str(src); // constructs a string from UTF-8 by default
1330 	dest = UT_UCS4_strcpy(dest, ucs4str.ucs4_str());
1331 
1332 	return dest;
1333 }
1334 
1335 
UT_UCS4_strcpy_to_char(char * dest,const UT_UCS4Char * src)1336 char * UT_UCS4_strcpy_to_char(char * dest, const UT_UCS4Char * src)
1337 {
1338 	UT_ASSERT(dest);
1339 	UT_ASSERT(src);
1340 
1341 	char * 			d = dest;
1342 	const UT_UCS4Char * 	s = static_cast<const UT_UCS4Char *>(src);
1343 
1344 	UT_Wctomb w(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1345 
1346 	while (*s != 0)
1347 	  {
1348 		int length;
1349 		w.wctomb_or_fallback(d,length,*s++);
1350 		d+=length;
1351 	  }
1352 	*d = 0;
1353 
1354 	return dest;
1355 }
1356 
UT_UCS4_strncpy_to_char(char * dest,const UT_UCS4Char * src,int n)1357 char * UT_UCS4_strncpy_to_char(char * dest, const UT_UCS4Char * src, int n)
1358 {
1359 	UT_ASSERT(dest);
1360 	UT_ASSERT(src);
1361 
1362 	char * 			d = dest;
1363 	const UT_UCS4Char * 	s = static_cast<const UT_UCS4Char *>(src);
1364 
1365 	UT_Wctomb w(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
1366 
1367 	while (*s != 0 && n > 0)
1368 	  {
1369 		int length;
1370 		w.wctomb_or_fallback(d,length,*s++, n);
1371 		d+=length;
1372 		n-=length;
1373 	  }
1374 	*d = 0;
1375 
1376 	return dest;
1377 }
1378 
UT_UCS4_cloneString(UT_UCS4Char ** dest,const UT_UCS4Char * src)1379 bool UT_UCS4_cloneString(UT_UCS4Char ** dest, const UT_UCS4Char * src)
1380 {
1381 	UT_uint32 length = UT_UCS4_strlen(src) + 1;
1382 	*dest = static_cast<UT_UCS4Char *>(UT_calloc(length,sizeof(UT_UCS4Char)));
1383 	if (!*dest)
1384 		return false;
1385 	memmove(*dest,src,length*sizeof(UT_UCS4Char));
1386 
1387 	return true;
1388 }
1389 
UT_UCS4_cloneString_char(UT_UCS4Char ** dest,const char * src)1390 bool UT_UCS4_cloneString_char(UT_UCS4Char ** dest, const char * src)
1391 {
1392   UT_uint32 length = strlen(src) + 1;
1393   *dest = static_cast<UT_UCS4Char *>(UT_calloc(length,sizeof(UT_UCS4Char)));
1394   if (!*dest)
1395     return false;
1396   UT_UCS4_strcpy_char(*dest, src);
1397 
1398   return true;
1399 }
1400 
s_pass_name(const char * & csstr,char end)1401 static const char * s_pass_name (const char *& csstr, char end)
1402 {
1403 	const char * name_end = csstr;
1404 
1405 	while (*csstr)
1406 	{
1407 		unsigned char u = static_cast<unsigned char>(*csstr);
1408 		if (u & 0x80)
1409 		{
1410 			UT_UTF8Stringbuf::UCS4Char ucs4 = UT_UTF8Stringbuf::charCode (csstr);
1411 			if (UT_UCS4_isspace (ucs4))
1412 			{
1413 				name_end = csstr;
1414 				break;
1415 			}
1416 			while (static_cast<unsigned char>(*++csstr) & 0x80)
1417 				;
1418 			continue;
1419 		}
1420 		else if ((isspace (static_cast<int>(u))) || (*csstr == end))
1421 		{
1422 			name_end = csstr;
1423 			break;
1424 		}
1425 		csstr++;
1426 	}
1427 	return name_end;
1428 }
1429 
1430 
/*!
    Advance csstr over a property value and return the end of the value
    with trailing ASCII whitespace excluded.

    Quote characters (' or ") toggle a quoted state; inside quotes neither
    ';' nor whitespace is special. An unquoted ';' ends the value and is
    consumed. The returned pointer is one past the last character that was
    not unquoted whitespace; when no such character was seen it equals the
    starting position.
*/
static const char * s_pass_value (const char *& csstr)
{
	const char * value_end = csstr;

	bool bQuoted = false;
	while (*csstr)
	{
		bool bSpace = false;
		unsigned char u = static_cast<unsigned char>(*csstr);
		if (u & 0x80)
		{
			UT_UTF8Stringbuf::UCS4Char ucs4 = UT_UTF8Stringbuf::charCode (csstr);

			// Unquoted multi-byte whitespace ends the scan right here,
			// leaving csstr on it (unlike ASCII whitespace below, which
			// is stepped over and the scan continues).
			if (!bQuoted)
				if (UT_UCS4_isspace (ucs4))
				{
					bSpace = true;
					break;
				}
			// step over this byte and every following byte with the high bit set
			while (static_cast<unsigned char>(*++csstr) & 0x80)
				;
			if (!bSpace)
				value_end = csstr;
			continue;
		}
		else if ((*csstr == '\'') || (*csstr == '"'))
		{
			// Either quote character toggles the state; they are not matched by kind.
			bQuoted = (bQuoted ? false : true);
		}
		else if (*csstr == ';')
		{
			if (!bQuoted)
			{
				// consume the separator but keep it out of the value
				csstr++;
				break;
			}
		}
		else if (!bQuoted && isspace (static_cast<int>(u)))
			bSpace = true;

		csstr++;
		// Only non-space characters extend the value, so whitespace in the
		// middle is kept (a later character moves value_end past it) while
		// whitespace at the end is dropped.
		if (!bSpace)
			value_end = csstr;
	}
	return value_end;
}
1477 
1478 
/*!
    Advance csstr_ptr over a quoted string.

    The first character decides the quote (' or "). A backslash makes the
    next character literal, and bytes of the form 10xxxxxx (UTF-8 trailing
    bytes) are never examined.

    \return 0 if csstr_ptr points at the terminator. On success, a pointer
            to the closing quote, with csstr_ptr moved just past it. On
            failure (no closing quote before the end of the string), the
            starting position, with csstr_ptr unchanged.
*/
static const char * s_pass_string (const char *& csstr_ptr)
{
	const char * const start = csstr_ptr;

	if (*start == 0)
		return 0;

	// When the first character is not a quote this stays 0, nothing can
	// ever close the string, and the scan ends in failure.
	const char quote = ((*start == '\'') || (*start == '"')) ? *start : 0;

	bool escaped = false;

	for (const char * p = start + 1; ; ++p)
	{
		const unsigned char u = static_cast<unsigned char>(*p);

		if ((u & 0xc0) == 0x80)
			continue; // trailing byte

		if (u == 0)
			return start; // unterminated: report failure, leave csstr_ptr alone

		if (escaped)
		{
			escaped = false;
			continue;
		}

		if (*p == quote)
		{
			csstr_ptr = p + 1;
			return p; // points to end quote
		}

		if (*p == '\\')
			escaped = true;
	}
}
1529 
s_pass_whitespace(const char * & csstr)1530 static void s_pass_whitespace (const char *& csstr)
1531 {
1532 	while (*csstr)
1533 	{
1534 		unsigned char u = static_cast<unsigned char>(*csstr);
1535 		if (u & 0x80)
1536 		{
1537 			UT_UTF8Stringbuf::UCS4Char ucs4 = UT_UTF8Stringbuf::charCode (csstr);
1538 			if (UT_UCS4_isspace (ucs4))
1539 			{
1540 				while (static_cast<unsigned char>(*++csstr) & 0x80)
1541 					;
1542 				continue;
1543 			}
1544 		}
1545 		else if (isspace (static_cast<int>(u)))
1546 		{
1547 			csstr++;
1548 			continue;
1549 		}
1550 		break;
1551 	}
1552 }
1553 
1554 
UT_parse_attributes(const char * attributes,std::map<std::string,std::string> & map)1555 void UT_parse_attributes(const char * attributes,
1556 						 std::map<std::string, std::string> & map)
1557 {
1558 	if ( attributes == 0)
1559 		return;
1560 	if (*attributes == 0)
1561 		return;
1562 
1563 	const char * atstr = attributes;
1564 
1565 	std::string name;
1566 	std::string value;
1567 
1568 	while (*atstr)
1569 	{
1570 		s_pass_whitespace (atstr);
1571 
1572 		const char * name_start = atstr;
1573 		const char * name_end   = s_pass_name (atstr, '=');
1574 
1575 		if (*atstr != '=')
1576 			break; // whatever we have, it's not a name="value" pair
1577 		if (name_start == name_end)
1578 			break; // ?? stray equals?
1579 
1580 		name.clear();
1581 		std::copy(name_start, name_end, name.begin());
1582 
1583 		atstr++;
1584 
1585 		if ((*atstr != '\'') && (*atstr != '"'))
1586 			break; // whatever we have, it's not a name="value" pair
1587 
1588 		const char * value_start = atstr;
1589 		const char * value_end   = s_pass_string (atstr);
1590 
1591 		if (value_start == value_end)
1592 			break; // ?? no value...
1593 
1594 		value_start++;
1595 
1596 		value.clear();
1597 		std::copy(value_start, value_end, value.begin());
1598 
1599 		map[name] = value;
1600 	}
1601 }
1602 
1603 
UT_parse_properties(const char * properties,std::map<std::string,std::string> & map)1604 void UT_parse_properties(const char * properties,
1605 									std::map<std::string, std::string> & map)
1606 {
1607 	if ( properties == 0)
1608 		return;
1609 	if (*properties == 0)
1610 		return;
1611 
1612 	const char * csstr = properties;
1613 
1614 	std::string name;
1615 	std::string value;
1616 
1617 	bool bSkip = false;
1618 
1619 	while (*csstr)
1620 	{
1621 		if (bSkip)
1622 		{
1623 			if (*csstr == ';')
1624 				bSkip = false;
1625 			++csstr;
1626 			continue;
1627 		}
1628 		s_pass_whitespace (csstr);
1629 
1630 		const char * name_start = csstr;
1631 		const char * name_end   = s_pass_name (csstr, ':');
1632 
1633 		if (*csstr == 0) break; // whatever we have, it's not a "name:value;" pair
1634 		if (name_start == name_end) // ?? stray colon?
1635 		{
1636 			bSkip = true;
1637 			continue;
1638 		}
1639 		name.resize(name_end - name_start);
1640 		std::copy(name_start, name_end, name.begin());
1641 
1642 		s_pass_whitespace (csstr);
1643 		if (*csstr != ':') // whatever we have, it's not a "name:value;" pair
1644 		{
1645 			bSkip = true;
1646 			continue;
1647 		}
1648 
1649 		csstr++;
1650 		s_pass_whitespace (csstr);
1651 
1652 		if (*csstr == 0)
1653 			break; // whatever we have, it's not a "name:value;" pair
1654 
1655 		const char * value_start = csstr;
1656 		const char * value_end   = s_pass_value (csstr);
1657 
1658 		if (value_start == value_end) // ?? no value...
1659 		{
1660 			bSkip = true;
1661 			continue;
1662 		}
1663 		value.resize(value_end - value_start);
1664 		std::copy(value_start, value_end, value.begin());
1665 
1666 		map[name] = value;
1667 	}
1668 }
1669 
/*
 Formats a floating point value with at most one fractional digit, always
 using a dot as the fractional separator, independent of the current
 locale's settings.

 The value is truncated, not rounded: a fractional part below 0.1 is
 dropped altogether, otherwise only its first decimal digit is kept.
 Negative values are formatted from their magnitude with a leading minus
 sign, so -1.5 gives "-1.5".

 The result lives in a static buffer that is overwritten by the next call.
*/
const char* std_size_string(float f)
{
  // Worst case is sign + 10 digits + '.' + 1 digit + NUL, well within 24.
  static char string[24];

  // Work on the magnitude so that truncation toward zero leaves a
  // non-negative fractional part for negative inputs too.
  const bool negative = (f < 0);
  const float magnitude = negative ? -f : f;

  const int whole = static_cast<int>(magnitude);
  const float fraction = magnitude - whole;

  if(fraction<0.1) {
    sprintf(string, "%d", negative ? -whole : whole);
  } else {
    const int tenths = int(10*fraction);
    sprintf(string,"%s%d.%d", negative ? "-" : "", whole, tenths);
  }
  return string;
}
1686 
1687 #ifndef TOOLKIT_WIN
1688 
UT_bidiGetCharType(UT_UCS4Char c)1689 UT_BidiCharType UT_bidiGetCharType(UT_UCS4Char c)
1690 {
1691 #ifndef NO_BIDI_SUPPORT
1692 	return fribidi_get_type(c);
1693 #else
1694 	return UT_BIDI_LTR;
1695 #endif
1696 }
1697 
/*!
    Reorder len characters of pStrIn from logical into visual order using
    fribidi_log2vis(), writing the result to pStrOut.

    pStrOut needs to contain space for len characters + terminating 0

    \param pStrIn  logical-order input
    \param len     number of characters to process
    \param baseDir base direction; passed to FriBidi by address
    \param pStrOut receives the visual-order string
    \return in bidi builds, whether fribidi_log2vis() returned non-zero;
            with NO_BIDI_SUPPORT the input is copied unchanged and the
            result is true
*/
bool UT_bidiReorderString(const UT_UCS4Char * pStrIn, UT_uint32 len, UT_BidiCharType baseDir,
						  UT_UCS4Char * pStrOut)
{
	// presumably bails out with false when either pointer is NULL -- see
	// the macro's definition
	UT_return_val_if_fail( pStrIn && pStrOut, false );

#ifndef NO_BIDI_SUPPORT
	// this works around 8685; this should be left here, in fact any decent optimising
	// compiler should remove this code if the bug does not exist
	if(sizeof(FriBidiChar) > sizeof(UT_UCS4Char))
	{
		// FriBidiChar is wider than UT_UCS4Char here, so the strings cannot
		// simply be cast; they are widened into scratch buffers instead.
		// The buffers are function-static, grow on demand and are kept
		// between calls.
		// NOTE(review): static buffers are shared by every caller -- looks
		// unsafe for concurrent use; confirm.
		static FriBidiChar* pFBDC = NULL;
		static FriBidiChar* pFBDC2 = NULL;
		static UT_uint32 iFBDlen = 0;

		if(iFBDlen < len + 1)
		{
			delete [] pFBDC; delete [] pFBDC2;
			// recorded as empty until both replacements exist
			iFBDlen = 0;

			pFBDC = new FriBidiChar [len + 1];
			pFBDC2 = new FriBidiChar [len + 1];

			UT_return_val_if_fail( pFBDC && pFBDC2, false );

			iFBDlen = len + 1;
		}

		// widen the input character by character
		UT_uint32 i;
		for(i = 0; i < len; ++i)
		{
			pFBDC[i] = (FriBidiChar) pStrIn[i];
		}

		pFBDC[i] = 0;

		int iRet = fribidi_log2vis (pFBDC, len, &baseDir, pFBDC2, NULL, NULL, NULL);

		// narrow the visual string back into the caller's buffer
		for(i = 0; i < len; ++i)
		{
			pStrOut[i] = (UT_UCS4Char) pFBDC2[i];
		}

		pStrOut[i] = 0;

		// implicit int -> bool: any non-zero result becomes true
		return iRet;
	}
	else
	{
		// Character types are not wider on the FriBidi side, so the
		// caller's buffers are handed over directly through casts.
		return (0 != fribidi_log2vis ((FriBidiChar *)pStrIn, len, &baseDir, (FriBidiChar*)pStrOut, NULL, NULL, NULL));
	}

#else
	// an empty input leaves pStrOut untouched
	if(!pStrIn || !*pStrIn)
		return true;

	UT_return_val_if_fail( pStrOut, false );

	// no reordering without bidi support: plain copy
	UT_UCS4_strncpy(pStrOut, pStrIn, len);
	return true;
#endif
}
1762 
UT_bidiMapLog2Vis(const UT_UCS4Char * pStrIn,UT_uint32 len,UT_BidiCharType baseDir,UT_uint32 * pL2V,UT_uint32 * pV2L,UT_Byte * pEmbed)1763 bool UT_bidiMapLog2Vis(const UT_UCS4Char * pStrIn, UT_uint32 len, UT_BidiCharType baseDir,
1764 					   UT_uint32 *pL2V, UT_uint32 * pV2L, UT_Byte * pEmbed)
1765 {
1766 #ifndef NO_BIDI_SUPPORT
1767 	// if this assert fails, we have a serious problem ...
1768 	UT_ASSERT_HARMLESS( sizeof(UT_UCS4Char) == sizeof(FriBidiChar) );
1769 	return (0 != fribidi_log2vis ((FriBidiChar *)pStrIn, len, &baseDir,
1770 								  NULL, (FriBidiStrIndex*)pL2V, (FriBidiStrIndex*)pV2L, (FriBidiLevel*)pEmbed));
1771 #else
1772 	UT_return_val_if_fail( pL2V && pV2L && pEmbed, false );
1773 	for(UT_uint32 i = 0; i < len; ++i)
1774 	{
1775 		pL2V[i] = i;
1776 		pV2L[i] = i;
1777 		pEmbed[i] = 0;
1778 	}
1779 
1780 	return true;
1781 #endif
1782 }
1783 
UT_bidiGetMirrorChar(UT_UCS4Char c,UT_UCS4Char & mc)1784 bool UT_bidiGetMirrorChar(UT_UCS4Char c, UT_UCS4Char &mc)
1785 {
1786 #ifndef NO_BIDI_SUPPORT
1787 	return (0 != fribidi_get_mirror_char(c, (FriBidiChar*)&mc));
1788 #else
1789 	return false;
1790 #endif
1791 }
1792 
1793 
1794 #endif
1795