1 /*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16 #define IN_LIBXML
17 #include "libxml.h"
18
19 #include <stdlib.h>
20 #include <string.h>
21 #include <libxml/xmlmemory.h>
22 #include <libxml/parserInternals.h>
23 #include <libxml/xmlstring.h>
24
25 /************************************************************************
26 * *
27 * Commodity functions to handle xmlChars *
28 * *
29 ************************************************************************/
30
31 /**
32 * xmlStrndup:
33 * @cur: the input xmlChar *
34 * @len: the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40 xmlChar *
xmlStrndup(const xmlChar * cur,int len)41 xmlStrndup(const xmlChar *cur, int len) {
42 xmlChar *ret;
43
44 if ((cur == NULL) || (len < 0)) return(NULL);
45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46 if (ret == NULL) {
47 xmlErrMemory(NULL, NULL);
48 return(NULL);
49 }
50 memcpy(ret, cur, len * sizeof(xmlChar));
51 ret[len] = 0;
52 return(ret);
53 }
54
55 /**
56 * xmlStrdup:
57 * @cur: the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65 xmlChar *
xmlStrdup(const xmlChar * cur)66 xmlStrdup(const xmlChar *cur) {
67 const xmlChar *p = cur;
68
69 if (cur == NULL) return(NULL);
70 while (*p != 0) p++; /* non input consuming */
71 return(xmlStrndup(cur, p - cur));
72 }
73
74 /**
75 * xmlCharStrndup:
76 * @cur: the input char *
77 * @len: the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84 xmlChar *
xmlCharStrndup(const char * cur,int len)85 xmlCharStrndup(const char *cur, int len) {
86 int i;
87 xmlChar *ret;
88
89 if ((cur == NULL) || (len < 0)) return(NULL);
90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91 if (ret == NULL) {
92 xmlErrMemory(NULL, NULL);
93 return(NULL);
94 }
95 for (i = 0;i < len;i++) {
96 ret[i] = (xmlChar) cur[i];
97 if (ret[i] == 0) return(ret);
98 }
99 ret[len] = 0;
100 return(ret);
101 }
102
103 /**
104 * xmlCharStrdup:
105 * @cur: the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112 xmlChar *
xmlCharStrdup(const char * cur)113 xmlCharStrdup(const char *cur) {
114 const char *p = cur;
115
116 if (cur == NULL) return(NULL);
117 while (*p != '\0') p++; /* non input consuming */
118 return(xmlCharStrndup(cur, p - cur));
119 }
120
121 /**
122 * xmlStrcmp:
123 * @str1: the first xmlChar *
124 * @str2: the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)132 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133 register int tmp;
134
135 if (str1 == str2) return(0);
136 if (str1 == NULL) return(-1);
137 if (str2 == NULL) return(1);
138 do {
139 tmp = *str1++ - *str2;
140 if (tmp != 0) return(tmp);
141 } while (*str2++ != 0);
142 return 0;
143 }
144
145 /**
146 * xmlStrEqual:
147 * @str1: the first xmlChar *
148 * @str2: the second xmlChar *
149 *
150 * Check if both strings are equal of have same content.
151 * Should be a bit more readable and faster than xmlStrcmp()
152 *
153 * Returns 1 if they are equal, 0 if they are different
154 */
155
156 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)157 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158 if (str1 == str2) return(1);
159 if (str1 == NULL) return(0);
160 if (str2 == NULL) return(0);
161 do {
162 if (*str1++ != *str2) return(0);
163 } while (*str2++);
164 return(1);
165 }
166
167 /**
168 * xmlStrQEqual:
169 * @pref: the prefix of the QName
170 * @name: the localname of the QName
171 * @str: the second xmlChar *
172 *
173 * Check if a QName is Equal to a given string
174 *
175 * Returns 1 if they are equal, 0 if they are different
176 */
177
178 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)179 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180 if (pref == NULL) return(xmlStrEqual(name, str));
181 if (name == NULL) return(0);
182 if (str == NULL) return(0);
183
184 do {
185 if (*pref++ != *str) return(0);
186 } while ((*str++) && (*pref));
187 if (*str++ != ':') return(0);
188 do {
189 if (*name++ != *str) return(0);
190 } while (*str++);
191 return(1);
192 }
193
194 /**
195 * xmlStrncmp:
196 * @str1: the first xmlChar *
197 * @str2: the second xmlChar *
198 * @len: the max comparison length
199 *
200 * a strncmp for xmlChar's
201 *
202 * Returns the integer result of the comparison
203 */
204
205 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)206 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207 register int tmp;
208
209 if (len <= 0) return(0);
210 if (str1 == str2) return(0);
211 if (str1 == NULL) return(-1);
212 if (str2 == NULL) return(1);
213 #ifdef __GNUC__
214 tmp = strncmp((const char *)str1, (const char *)str2, len);
215 return tmp;
216 #else
217 do {
218 tmp = *str1++ - *str2;
219 if (tmp != 0 || --len == 0) return(tmp);
220 } while (*str2++ != 0);
221 return 0;
222 #endif
223 }
224
225 static const xmlChar casemap[256] = {
226 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
258 };
259
260 /**
261 * xmlStrcasecmp:
262 * @str1: the first xmlChar *
263 * @str2: the second xmlChar *
264 *
265 * a strcasecmp for xmlChar's
266 *
267 * Returns the integer result of the comparison
268 */
269
270 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)271 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272 register int tmp;
273
274 if (str1 == str2) return(0);
275 if (str1 == NULL) return(-1);
276 if (str2 == NULL) return(1);
277 do {
278 tmp = casemap[*str1++] - casemap[*str2];
279 if (tmp != 0) return(tmp);
280 } while (*str2++ != 0);
281 return 0;
282 }
283
284 /**
285 * xmlStrncasecmp:
286 * @str1: the first xmlChar *
287 * @str2: the second xmlChar *
288 * @len: the max comparison length
289 *
290 * a strncasecmp for xmlChar's
291 *
292 * Returns the integer result of the comparison
293 */
294
295 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)296 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297 register int tmp;
298
299 if (len <= 0) return(0);
300 if (str1 == str2) return(0);
301 if (str1 == NULL) return(-1);
302 if (str2 == NULL) return(1);
303 do {
304 tmp = casemap[*str1++] - casemap[*str2];
305 if (tmp != 0 || --len == 0) return(tmp);
306 } while (*str2++ != 0);
307 return 0;
308 }
309
310 /**
311 * xmlStrchr:
312 * @str: the xmlChar * array
313 * @val: the xmlChar to search
314 *
315 * a strchr for xmlChar's
316 *
317 * Returns the xmlChar * for the first occurrence or NULL.
318 */
319
320 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)321 xmlStrchr(const xmlChar *str, xmlChar val) {
322 if (str == NULL) return(NULL);
323 while (*str != 0) { /* non input consuming */
324 if (*str == val) return((xmlChar *) str);
325 str++;
326 }
327 return(NULL);
328 }
329
330 /**
331 * xmlStrstr:
332 * @str: the xmlChar * array (haystack)
333 * @val: the xmlChar to search (needle)
334 *
335 * a strstr for xmlChar's
336 *
337 * Returns the xmlChar * for the first occurrence or NULL.
338 */
339
340 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)341 xmlStrstr(const xmlChar *str, const xmlChar *val) {
342 int n;
343
344 if (str == NULL) return(NULL);
345 if (val == NULL) return(NULL);
346 n = xmlStrlen(val);
347
348 if (n == 0) return(str);
349 while (*str != 0) { /* non input consuming */
350 if (*str == *val) {
351 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352 }
353 str++;
354 }
355 return(NULL);
356 }
357
358 /**
359 * xmlStrcasestr:
360 * @str: the xmlChar * array (haystack)
361 * @val: the xmlChar to search (needle)
362 *
363 * a case-ignoring strstr for xmlChar's
364 *
365 * Returns the xmlChar * for the first occurrence or NULL.
366 */
367
368 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)369 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
370 int n;
371
372 if (str == NULL) return(NULL);
373 if (val == NULL) return(NULL);
374 n = xmlStrlen(val);
375
376 if (n == 0) return(str);
377 while (*str != 0) { /* non input consuming */
378 if (casemap[*str] == casemap[*val])
379 if (!xmlStrncasecmp(str, val, n)) return(str);
380 str++;
381 }
382 return(NULL);
383 }
384
385 /**
386 * xmlStrsub:
387 * @str: the xmlChar * array (haystack)
388 * @start: the index of the first char (zero based)
389 * @len: the length of the substring
390 *
391 * Extract a substring of a given string
392 *
393 * Returns the xmlChar * for the first occurrence or NULL.
394 */
395
396 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)397 xmlStrsub(const xmlChar *str, int start, int len) {
398 int i;
399
400 if (str == NULL) return(NULL);
401 if (start < 0) return(NULL);
402 if (len < 0) return(NULL);
403
404 for (i = 0;i < start;i++) {
405 if (*str == 0) return(NULL);
406 str++;
407 }
408 if (*str == 0) return(NULL);
409 return(xmlStrndup(str, len));
410 }
411
412 /**
413 * xmlStrlen:
414 * @str: the xmlChar * array
415 *
416 * length of a xmlChar's string
417 *
418 * Returns the number of xmlChar contained in the ARRAY.
419 */
420
421 int
xmlStrlen(const xmlChar * str)422 xmlStrlen(const xmlChar *str) {
423 int len = 0;
424
425 if (str == NULL) return(0);
426 while (*str != 0) { /* non input consuming */
427 str++;
428 len++;
429 }
430 return(len);
431 }
432
433 /**
434 * xmlStrncat:
435 * @cur: the original xmlChar * array
436 * @add: the xmlChar * array added
437 * @len: the length of @add
438 *
439 * a strncat for array of xmlChar's, it will extend @cur with the len
440 * first bytes of @add. Note that if @len < 0 then this is an API error
441 * and NULL will be returned.
442 *
443 * Returns a new xmlChar *, the original @cur is reallocated and should
444 * not be freed.
445 */
446
447 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)448 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449 int size;
450 xmlChar *ret;
451
452 if ((add == NULL) || (len == 0))
453 return(cur);
454 if (len < 0)
455 return(NULL);
456 if (cur == NULL)
457 return(xmlStrndup(add, len));
458
459 size = xmlStrlen(cur);
460 if (size < 0)
461 return(NULL);
462 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
463 if (ret == NULL) {
464 xmlErrMemory(NULL, NULL);
465 return(cur);
466 }
467 memcpy(&ret[size], add, len * sizeof(xmlChar));
468 ret[size + len] = 0;
469 return(ret);
470 }
471
472 /**
473 * xmlStrncatNew:
474 * @str1: first xmlChar string
475 * @str2: second xmlChar string
476 * @len: the len of @str2 or < 0
477 *
478 * same as xmlStrncat, but creates a new string. The original
479 * two strings are not freed. If @len is < 0 then the length
480 * will be calculated automatically.
481 *
482 * Returns a new xmlChar * or NULL
483 */
484 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)485 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
486 int size;
487 xmlChar *ret;
488
489 if (len < 0) {
490 len = xmlStrlen(str2);
491 if (len < 0)
492 return(NULL);
493 }
494 if ((str2 == NULL) || (len == 0))
495 return(xmlStrdup(str1));
496 if (str1 == NULL)
497 return(xmlStrndup(str2, len));
498
499 size = xmlStrlen(str1);
500 if (size < 0)
501 return(NULL);
502 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
503 if (ret == NULL) {
504 xmlErrMemory(NULL, NULL);
505 return(xmlStrndup(str1, size));
506 }
507 memcpy(ret, str1, size * sizeof(xmlChar));
508 memcpy(&ret[size], str2, len * sizeof(xmlChar));
509 ret[size + len] = 0;
510 return(ret);
511 }
512
513 /**
514 * xmlStrcat:
515 * @cur: the original xmlChar * array
516 * @add: the xmlChar * array added
517 *
518 * a strcat for array of xmlChar's. Since they are supposed to be
519 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
520 * a termination mark of '0'.
521 *
522 * Returns a new xmlChar * containing the concatenated string. The original
523 * @cur is reallocated and should not be freed.
524 */
525 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)526 xmlStrcat(xmlChar *cur, const xmlChar *add) {
527 const xmlChar *p = add;
528
529 if (add == NULL) return(cur);
530 if (cur == NULL)
531 return(xmlStrdup(add));
532
533 while (*p != 0) p++; /* non input consuming */
534 return(xmlStrncat(cur, add, p - add));
535 }
536
537 /**
538 * xmlStrPrintf:
539 * @buf: the result buffer.
540 * @len: the result buffer length.
541 * @msg: the message with printf formatting.
542 * @...: extra parameters for the message.
543 *
544 * Formats @msg and places result into @buf.
545 *
546 * Returns the number of characters written to @buf or -1 if an error occurs.
547 */
548 int XMLCDECL
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)549 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
550 va_list args;
551 int ret;
552
553 if((buf == NULL) || (msg == NULL)) {
554 return(-1);
555 }
556
557 va_start(args, msg);
558 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
559 va_end(args);
560 buf[len - 1] = 0; /* be safe ! */
561
562 return(ret);
563 }
564
565 /**
566 * xmlStrVPrintf:
567 * @buf: the result buffer.
568 * @len: the result buffer length.
569 * @msg: the message with printf formatting.
570 * @ap: extra parameters for the message.
571 *
572 * Formats @msg and places result into @buf.
573 *
574 * Returns the number of characters written to @buf or -1 if an error occurs.
575 */
576 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)577 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
578 int ret;
579
580 if((buf == NULL) || (msg == NULL)) {
581 return(-1);
582 }
583
584 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
585 buf[len - 1] = 0; /* be safe ! */
586
587 return(ret);
588 }
589
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * Values > 0xFFFF use the 4-byte form; xmlGetUTF8Char(),               *
 * xmlCheckUTF8() and xmlUTF8Strlen() below accept it.                  *
 *                                                                      *
 ************************************************************************/
604
605
606 /**
607 * xmlUTF8Size:
608 * @utf: pointer to the UTF8 character
609 *
610 * calculates the internal size of a UTF8 character
611 *
612 * returns the numbers of bytes in the character, -1 on format error
613 */
614 int
xmlUTF8Size(const xmlChar * utf)615 xmlUTF8Size(const xmlChar *utf) {
616 xmlChar mask;
617 int len;
618
619 if (utf == NULL)
620 return -1;
621 if (*utf < 0x80)
622 return 1;
623 /* check valid UTF8 character */
624 if (!(*utf & 0x40))
625 return -1;
626 /* determine number of bytes in char */
627 len = 2;
628 for (mask=0x20; mask != 0; mask>>=1) {
629 if (!(*utf & mask))
630 return len;
631 len++;
632 }
633 return -1;
634 }
635
636 /**
637 * xmlUTF8Charcmp:
638 * @utf1: pointer to first UTF8 char
639 * @utf2: pointer to second UTF8 char
640 *
641 * compares the two UCS4 values
642 *
643 * returns result of the compare as with xmlStrncmp
644 */
645 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)646 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
647
648 if (utf1 == NULL ) {
649 if (utf2 == NULL)
650 return 0;
651 return -1;
652 }
653 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
654 }
655
656 /**
657 * xmlUTF8Strlen:
658 * @utf: a sequence of UTF-8 encoded bytes
659 *
660 * compute the length of an UTF8 string, it doesn't do a full UTF8
661 * checking of the content of the string.
662 *
663 * Returns the number of characters in the string or -1 in case of error
664 */
665 int
xmlUTF8Strlen(const xmlChar * utf)666 xmlUTF8Strlen(const xmlChar *utf) {
667 int ret = 0;
668
669 if (utf == NULL)
670 return(-1);
671
672 while (*utf != 0) {
673 if (utf[0] & 0x80) {
674 if ((utf[1] & 0xc0) != 0x80)
675 return(-1);
676 if ((utf[0] & 0xe0) == 0xe0) {
677 if ((utf[2] & 0xc0) != 0x80)
678 return(-1);
679 if ((utf[0] & 0xf0) == 0xf0) {
680 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
681 return(-1);
682 utf += 4;
683 } else {
684 utf += 3;
685 }
686 } else {
687 utf += 2;
688 }
689 } else {
690 utf++;
691 }
692 ret++;
693 }
694 return(ret);
695 }
696
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. The lead byte must be a
 * valid start byte (0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx) and
 * every following byte must be a continuation byte (10xxxxxx).
 * Overlong encodings and the value range are not checked.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /*
         * 10xxxxxx is a continuation byte and can never start a
         * character; without this test it would be decoded as the
         * lead byte of a 2-byte sequence.
         */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 11111xxx would announce more than 4 bytes */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
766
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    int trail;

    if (utf == NULL)
        return(0);
    /*
     * Each character is 1, 2, 3 or 4 bytes long. The valid forms
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cur = utf;
    while (*cur != 0) {                 /* string is 0-terminated */
        unsigned char lead = *cur++;

        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trail = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trail = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trail = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trail = 3;
        else                            /* unknown encoding */
            return 0;

        /*
         * Validate the continuation bytes one at a time: a byte is only
         * skipped once it proved to be 10xxxxxx, so the terminator is
         * never stepped over.
         */
        for (; trail > 0; trail--) {
            if ((*cur & 0xc0) != 0x80)
                return 0;
            cur++;
        }
    }
    return(1);
}
819
820 /**
821 * xmlUTF8Strsize:
822 * @utf: a sequence of UTF-8 encoded bytes
823 * @len: the number of characters in the array
824 *
825 * storage size of an UTF8 string
826 * the behaviour is not guaranteed if the input string is not UTF-8
827 *
828 * Returns the storage size of
829 * the first 'len' characters of ARRAY
830 */
831
832 int
xmlUTF8Strsize(const xmlChar * utf,int len)833 xmlUTF8Strsize(const xmlChar *utf, int len) {
834 const xmlChar *ptr=utf;
835 xmlChar ch;
836
837 if (utf == NULL)
838 return(0);
839
840 if (len <= 0)
841 return(0);
842
843 while ( len-- > 0) {
844 if ( !*ptr )
845 break;
846 if ( (ch = *ptr++) & 0x80) {
847 // Workaround for an optimization bug in VS 2015 Update 2, remove
848 // once the fix is released. crbug.com/599427
849 // https://connect.microsoft.com/VisualStudio/feedback/details/2582138
850 xmlChar ch2 = ch;
851 while ((ch2<<=1) & 0x80 ) {
852 ptr++;
853 if (*ptr == 0) break;
854 }
855 }
856 }
857 return (ptr - utf);
858 }
859
860 /**
861 * xmlUTF8Strndup:
862 * @utf: the input UTF8 *
863 * @len: the len of @utf (in chars)
864 *
865 * a strndup for array of UTF8's
866 *
867 * Returns a new UTF8 * or NULL
868 */
869 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)870 xmlUTF8Strndup(const xmlChar *utf, int len) {
871 xmlChar *ret;
872 int i;
873
874 if ((utf == NULL) || (len < 0)) return(NULL);
875 i = xmlUTF8Strsize(utf, len);
876 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
877 if (ret == NULL) {
878 xmlGenericError(xmlGenericErrorContext,
879 "malloc of %ld byte failed\n",
880 (len + 1) * (long)sizeof(xmlChar));
881 return(NULL);
882 }
883 memcpy(ret, utf, i * sizeof(xmlChar));
884 ret[i] = 0;
885 return(ret);
886 }
887
888 /**
889 * xmlUTF8Strpos:
890 * @utf: the input UTF8 *
891 * @pos: the position of the desired UTF8 char (in chars)
892 *
893 * a function to provide the equivalent of fetching a
894 * character from a string array
895 *
896 * Returns a pointer to the UTF8 character or NULL
897 */
898 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)899 xmlUTF8Strpos(const xmlChar *utf, int pos) {
900 xmlChar ch;
901
902 if (utf == NULL) return(NULL);
903 if (pos < 0)
904 return(NULL);
905 while (pos--) {
906 if ((ch=*utf++) == 0) return(NULL);
907 if ( ch & 0x80 ) {
908 /* if not simple ascii, verify proper format */
909 if ( (ch & 0xc0) != 0xc0 )
910 return(NULL);
911 /* then skip over remaining bytes for this char */
912 while ( (ch <<= 1) & 0x80 )
913 if ( (*utf++ & 0xc0) != 0x80 )
914 return(NULL);
915 }
916 }
917 return((xmlChar *)utf);
918 }
919
920 /**
921 * xmlUTF8Strloc:
922 * @utf: the input UTF8 *
923 * @utfchar: the UTF8 character to be found
924 *
925 * a function to provide the relative location of a UTF8 char
926 *
927 * Returns the relative character position of the desired char
928 * or -1 if not found
929 */
930 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)931 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
932 int i, size;
933 xmlChar ch;
934
935 if (utf==NULL || utfchar==NULL) return -1;
936 size = xmlUTF8Strsize(utfchar, 1);
937 for(i=0; (ch=*utf) != 0; i++) {
938 if (xmlStrncmp(utf, utfchar, size)==0)
939 return(i);
940 utf++;
941 if ( ch & 0x80 ) {
942 /* if not simple ascii, verify proper format */
943 if ( (ch & 0xc0) != 0xc0 )
944 return(-1);
945 /* then skip over remaining bytes for this char */
946 while ( (ch <<= 1) & 0x80 )
947 if ( (*utf++ & 0xc0) != 0x80 )
948 return(-1);
949 }
950 }
951
952 return(-1);
953 }
954 /**
955 * xmlUTF8Strsub:
956 * @utf: a sequence of UTF-8 encoded bytes
957 * @start: relative pos of first char
958 * @len: total number to copy
959 *
960 * Create a substring from a given UTF-8 string
961 * Note: positions are given in units of UTF-8 chars
962 *
963 * Returns a pointer to a newly created string
964 * or NULL if any problem
965 */
966
967 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)968 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
969 int i;
970 xmlChar ch;
971
972 if (utf == NULL) return(NULL);
973 if (start < 0) return(NULL);
974 if (len < 0) return(NULL);
975
976 /*
977 * Skip over any leading chars
978 */
979 for (i = 0;i < start;i++) {
980 if ((ch=*utf++) == 0) return(NULL);
981 if ( ch & 0x80 ) {
982 /* if not simple ascii, verify proper format */
983 if ( (ch & 0xc0) != 0xc0 )
984 return(NULL);
985 /* then skip over remaining bytes for this char */
986 while ( (ch <<= 1) & 0x80 )
987 if ( (*utf++ & 0xc0) != 0x80 )
988 return(NULL);
989 }
990 }
991
992 return(xmlUTF8Strndup(utf, len));
993 }
994
995 /**
996 * xmlEscapeFormatString:
997 * @msg: a pointer to the string in which to escape '%' characters.
998 * Must be a heap-allocated buffer created by libxml2 that may be
999 * returned, or that may be freed and replaced.
1000 *
1001 * Replaces the string pointed to by 'msg' with an escaped string.
1002 * Returns the same string with all '%' characters escaped.
1003 */
1004 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1005 xmlEscapeFormatString(xmlChar **msg)
1006 {
1007 xmlChar *msgPtr = NULL;
1008 xmlChar *result = NULL;
1009 xmlChar *resultPtr = NULL;
1010 size_t count = 0;
1011 size_t msgLen = 0;
1012 size_t resultLen = 0;
1013
1014 if (!msg || !*msg)
1015 return(NULL);
1016
1017 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1018 ++msgLen;
1019 if (*msgPtr == '%')
1020 ++count;
1021 }
1022
1023 if (count == 0)
1024 return(*msg);
1025
1026 resultLen = msgLen + count + 1;
1027 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1028 if (result == NULL) {
1029 /* Clear *msg to prevent format string vulnerabilities in
1030 out-of-memory situations. */
1031 xmlFree(*msg);
1032 *msg = NULL;
1033 xmlErrMemory(NULL, NULL);
1034 return(NULL);
1035 }
1036
1037 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1038 *resultPtr = *msgPtr;
1039 if (*msgPtr == '%')
1040 *(++resultPtr) = '%';
1041 }
1042 result[resultLen - 1] = '\0';
1043
1044 xmlFree(*msg);
1045 *msg = result;
1046
1047 return *msg;
1048 }
1049
1050 #define bottom_xmlstring
1051 #include "elfgcchack.h"
1052