1 /*
2  * string.c : an XML string utilities module
3  *
4  * This module provides various utility functions for manipulating
5  * the xmlChar* type. All functions named xmlStr* have been moved here
6  * from the parser.c file (their original home).
7  *
8  * See Copyright for the status of this software.
9  *
10  * UTF8 string routines from:
11  * William Brack <wbrack@mmm.com.hk>
12  *
13  * daniel@veillard.com
14  */
15 
16 #define IN_LIBXML
#include "libxml.h"

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
24 
25 /************************************************************************
26  *                                                                      *
27  *                Commodity functions to handle xmlChars                *
28  *                                                                      *
29  ************************************************************************/
30 
31 /**
32  * xmlStrndup:
33  * @cur:  the input xmlChar *
34  * @len:  the len of @cur
35  *
36  * a strndup for array of xmlChar's
37  *
38  * Returns a new xmlChar * or NULL
39  */
40 xmlChar *
xmlStrndup(const xmlChar * cur,int len)41 xmlStrndup(const xmlChar *cur, int len) {
42     xmlChar *ret;
43 
44     if ((cur == NULL) || (len < 0)) return(NULL);
45     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46     if (ret == NULL) {
47         xmlErrMemory(NULL, NULL);
48         return(NULL);
49     }
50     memcpy(ret, cur, len * sizeof(xmlChar));
51     ret[len] = 0;
52     return(ret);
53 }
54 
55 /**
56  * xmlStrdup:
57  * @cur:  the input xmlChar *
58  *
59  * a strdup for array of xmlChar's. Since they are supposed to be
60  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61  * a termination mark of '0'.
62  *
63  * Returns a new xmlChar * or NULL
64  */
65 xmlChar *
xmlStrdup(const xmlChar * cur)66 xmlStrdup(const xmlChar *cur) {
67     const xmlChar *p = cur;
68 
69     if (cur == NULL) return(NULL);
70     while (*p != 0) p++; /* non input consuming */
71     return(xmlStrndup(cur, p - cur));
72 }
73 
74 /**
75  * xmlCharStrndup:
76  * @cur:  the input char *
77  * @len:  the len of @cur
78  *
79  * a strndup for char's to xmlChar's
80  *
81  * Returns a new xmlChar * or NULL
82  */
83 
84 xmlChar *
xmlCharStrndup(const char * cur,int len)85 xmlCharStrndup(const char *cur, int len) {
86     int i;
87     xmlChar *ret;
88 
89     if ((cur == NULL) || (len < 0)) return(NULL);
90     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91     if (ret == NULL) {
92         xmlErrMemory(NULL, NULL);
93         return(NULL);
94     }
95     for (i = 0;i < len;i++) {
96         ret[i] = (xmlChar) cur[i];
97         if (ret[i] == 0) return(ret);
98     }
99     ret[len] = 0;
100     return(ret);
101 }
102 
103 /**
104  * xmlCharStrdup:
105  * @cur:  the input char *
106  *
107  * a strdup for char's to xmlChar's
108  *
109  * Returns a new xmlChar * or NULL
110  */
111 
112 xmlChar *
xmlCharStrdup(const char * cur)113 xmlCharStrdup(const char *cur) {
114     const char *p = cur;
115 
116     if (cur == NULL) return(NULL);
117     while (*p != '\0') p++; /* non input consuming */
118     return(xmlCharStrndup(cur, p - cur));
119 }
120 
121 /**
122  * xmlStrcmp:
123  * @str1:  the first xmlChar *
124  * @str2:  the second xmlChar *
125  *
126  * a strcmp for xmlChar's
127  *
128  * Returns the integer result of the comparison
129  */
130 
131 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)132 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133     register int tmp;
134 
135     if (str1 == str2) return(0);
136     if (str1 == NULL) return(-1);
137     if (str2 == NULL) return(1);
138     do {
139         tmp = *str1++ - *str2;
140         if (tmp != 0) return(tmp);
141     } while (*str2++ != 0);
142     return 0;
143 }
144 
145 /**
146  * xmlStrEqual:
147  * @str1:  the first xmlChar *
148  * @str2:  the second xmlChar *
149  *
150  * Check if both strings are equal of have same content.
151  * Should be a bit more readable and faster than xmlStrcmp()
152  *
153  * Returns 1 if they are equal, 0 if they are different
154  */
155 
156 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)157 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158     if (str1 == str2) return(1);
159     if (str1 == NULL) return(0);
160     if (str2 == NULL) return(0);
161     do {
162         if (*str1++ != *str2) return(0);
163     } while (*str2++);
164     return(1);
165 }
166 
167 /**
168  * xmlStrQEqual:
169  * @pref:  the prefix of the QName
170  * @name:  the localname of the QName
171  * @str:  the second xmlChar *
172  *
173  * Check if a QName is Equal to a given string
174  *
175  * Returns 1 if they are equal, 0 if they are different
176  */
177 
178 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)179 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180     if (pref == NULL) return(xmlStrEqual(name, str));
181     if (name == NULL) return(0);
182     if (str == NULL) return(0);
183 
184     do {
185         if (*pref++ != *str) return(0);
186     } while ((*str++) && (*pref));
187     if (*str++ != ':') return(0);
188     do {
189         if (*name++ != *str) return(0);
190     } while (*str++);
191     return(1);
192 }
193 
194 /**
195  * xmlStrncmp:
196  * @str1:  the first xmlChar *
197  * @str2:  the second xmlChar *
198  * @len:  the max comparison length
199  *
200  * a strncmp for xmlChar's
201  *
202  * Returns the integer result of the comparison
203  */
204 
205 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)206 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207     register int tmp;
208 
209     if (len <= 0) return(0);
210     if (str1 == str2) return(0);
211     if (str1 == NULL) return(-1);
212     if (str2 == NULL) return(1);
213 #ifdef __GNUC__
214     tmp = strncmp((const char *)str1, (const char *)str2, len);
215     return tmp;
216 #else
217     do {
218         tmp = *str1++ - *str2;
219         if (tmp != 0 || --len == 0) return(tmp);
220     } while (*str2++ != 0);
221     return 0;
222 #endif
223 }
224 
225 static const xmlChar casemap[256] = {
226     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
258 };
259 
260 /**
261  * xmlStrcasecmp:
262  * @str1:  the first xmlChar *
263  * @str2:  the second xmlChar *
264  *
265  * a strcasecmp for xmlChar's
266  *
267  * Returns the integer result of the comparison
268  */
269 
270 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)271 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272     register int tmp;
273 
274     if (str1 == str2) return(0);
275     if (str1 == NULL) return(-1);
276     if (str2 == NULL) return(1);
277     do {
278         tmp = casemap[*str1++] - casemap[*str2];
279         if (tmp != 0) return(tmp);
280     } while (*str2++ != 0);
281     return 0;
282 }
283 
284 /**
285  * xmlStrncasecmp:
286  * @str1:  the first xmlChar *
287  * @str2:  the second xmlChar *
288  * @len:  the max comparison length
289  *
290  * a strncasecmp for xmlChar's
291  *
292  * Returns the integer result of the comparison
293  */
294 
295 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)296 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297     register int tmp;
298 
299     if (len <= 0) return(0);
300     if (str1 == str2) return(0);
301     if (str1 == NULL) return(-1);
302     if (str2 == NULL) return(1);
303     do {
304         tmp = casemap[*str1++] - casemap[*str2];
305         if (tmp != 0 || --len == 0) return(tmp);
306     } while (*str2++ != 0);
307     return 0;
308 }
309 
310 /**
311  * xmlStrchr:
312  * @str:  the xmlChar * array
313  * @val:  the xmlChar to search
314  *
315  * a strchr for xmlChar's
316  *
317  * Returns the xmlChar * for the first occurrence or NULL.
318  */
319 
320 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)321 xmlStrchr(const xmlChar *str, xmlChar val) {
322     if (str == NULL) return(NULL);
323     while (*str != 0) { /* non input consuming */
324         if (*str == val) return((xmlChar *) str);
325         str++;
326     }
327     return(NULL);
328 }
329 
330 /**
331  * xmlStrstr:
332  * @str:  the xmlChar * array (haystack)
333  * @val:  the xmlChar to search (needle)
334  *
335  * a strstr for xmlChar's
336  *
337  * Returns the xmlChar * for the first occurrence or NULL.
338  */
339 
340 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)341 xmlStrstr(const xmlChar *str, const xmlChar *val) {
342     int n;
343 
344     if (str == NULL) return(NULL);
345     if (val == NULL) return(NULL);
346     n = xmlStrlen(val);
347 
348     if (n == 0) return(str);
349     while (*str != 0) { /* non input consuming */
350         if (*str == *val) {
351             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352         }
353         str++;
354     }
355     return(NULL);
356 }
357 
358 /**
359  * xmlStrcasestr:
360  * @str:  the xmlChar * array (haystack)
361  * @val:  the xmlChar to search (needle)
362  *
363  * a case-ignoring strstr for xmlChar's
364  *
365  * Returns the xmlChar * for the first occurrence or NULL.
366  */
367 
368 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)369 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
370     int n;
371 
372     if (str == NULL) return(NULL);
373     if (val == NULL) return(NULL);
374     n = xmlStrlen(val);
375 
376     if (n == 0) return(str);
377     while (*str != 0) { /* non input consuming */
378         if (casemap[*str] == casemap[*val])
379             if (!xmlStrncasecmp(str, val, n)) return(str);
380         str++;
381     }
382     return(NULL);
383 }
384 
385 /**
386  * xmlStrsub:
387  * @str:  the xmlChar * array (haystack)
388  * @start:  the index of the first char (zero based)
389  * @len:  the length of the substring
390  *
391  * Extract a substring of a given string
392  *
393  * Returns the xmlChar * for the first occurrence or NULL.
394  */
395 
396 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)397 xmlStrsub(const xmlChar *str, int start, int len) {
398     int i;
399 
400     if (str == NULL) return(NULL);
401     if (start < 0) return(NULL);
402     if (len < 0) return(NULL);
403 
404     for (i = 0;i < start;i++) {
405         if (*str == 0) return(NULL);
406         str++;
407     }
408     if (*str == 0) return(NULL);
409     return(xmlStrndup(str, len));
410 }
411 
412 /**
413  * xmlStrlen:
414  * @str:  the xmlChar * array
415  *
416  * length of a xmlChar's string
417  *
418  * Returns the number of xmlChar contained in the ARRAY.
419  */
420 
421 int
xmlStrlen(const xmlChar * str)422 xmlStrlen(const xmlChar *str) {
423     int len = 0;
424 
425     if (str == NULL) return(0);
426     while (*str != 0) { /* non input consuming */
427         str++;
428         len++;
429     }
430     return(len);
431 }
432 
433 /**
434  * xmlStrncat:
435  * @cur:  the original xmlChar * array
436  * @add:  the xmlChar * array added
437  * @len:  the length of @add
438  *
439  * a strncat for array of xmlChar's, it will extend @cur with the len
440  * first bytes of @add. Note that if @len < 0 then this is an API error
441  * and NULL will be returned.
442  *
443  * Returns a new xmlChar *, the original @cur is reallocated and should
444  * not be freed.
445  */
446 
447 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)448 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449     int size;
450     xmlChar *ret;
451 
452     if ((add == NULL) || (len == 0))
453         return(cur);
454     if (len < 0)
455 	return(NULL);
456     if (cur == NULL)
457         return(xmlStrndup(add, len));
458 
459     size = xmlStrlen(cur);
460     if (size < 0)
461         return(NULL);
462     ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
463     if (ret == NULL) {
464         xmlErrMemory(NULL, NULL);
465         return(cur);
466     }
467     memcpy(&ret[size], add, len * sizeof(xmlChar));
468     ret[size + len] = 0;
469     return(ret);
470 }
471 
472 /**
473  * xmlStrncatNew:
474  * @str1:  first xmlChar string
475  * @str2:  second xmlChar string
476  * @len:  the len of @str2 or < 0
477  *
478  * same as xmlStrncat, but creates a new string.  The original
479  * two strings are not freed. If @len is < 0 then the length
480  * will be calculated automatically.
481  *
482  * Returns a new xmlChar * or NULL
483  */
484 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)485 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
486     int size;
487     xmlChar *ret;
488 
489     if (len < 0) {
490         len = xmlStrlen(str2);
491         if (len < 0)
492             return(NULL);
493     }
494     if ((str2 == NULL) || (len == 0))
495         return(xmlStrdup(str1));
496     if (str1 == NULL)
497         return(xmlStrndup(str2, len));
498 
499     size = xmlStrlen(str1);
500     if (size < 0)
501         return(NULL);
502     ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
503     if (ret == NULL) {
504         xmlErrMemory(NULL, NULL);
505         return(xmlStrndup(str1, size));
506     }
507     memcpy(ret, str1, size * sizeof(xmlChar));
508     memcpy(&ret[size], str2, len * sizeof(xmlChar));
509     ret[size + len] = 0;
510     return(ret);
511 }
512 
513 /**
514  * xmlStrcat:
515  * @cur:  the original xmlChar * array
516  * @add:  the xmlChar * array added
517  *
518  * a strcat for array of xmlChar's. Since they are supposed to be
519  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
520  * a termination mark of '0'.
521  *
522  * Returns a new xmlChar * containing the concatenated string. The original
523  * @cur is reallocated and should not be freed.
524  */
525 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)526 xmlStrcat(xmlChar *cur, const xmlChar *add) {
527     const xmlChar *p = add;
528 
529     if (add == NULL) return(cur);
530     if (cur == NULL)
531         return(xmlStrdup(add));
532 
533     while (*p != 0) p++; /* non input consuming */
534     return(xmlStrncat(cur, add, p - add));
535 }
536 
537 /**
538  * xmlStrPrintf:
539  * @buf:   the result buffer.
540  * @len:   the result buffer length.
541  * @msg:   the message with printf formatting.
542  * @...:   extra parameters for the message.
543  *
544  * Formats @msg and places result into @buf.
545  *
546  * Returns the number of characters written to @buf or -1 if an error occurs.
547  */
548 int XMLCDECL
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)549 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
550     va_list args;
551     int ret;
552 
553     if((buf == NULL) || (msg == NULL)) {
554         return(-1);
555     }
556 
557     va_start(args, msg);
558     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
559     va_end(args);
560     buf[len - 1] = 0; /* be safe ! */
561 
562     return(ret);
563 }
564 
565 /**
566  * xmlStrVPrintf:
567  * @buf:   the result buffer.
568  * @len:   the result buffer length.
569  * @msg:   the message with printf formatting.
570  * @ap:    extra parameters for the message.
571  *
572  * Formats @msg and places result into @buf.
573  *
574  * Returns the number of characters written to @buf or -1 if an error occurs.
575  */
576 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)577 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
578     int ret;
579 
580     if((buf == NULL) || (msg == NULL)) {
581         return(-1);
582     }
583 
584     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
585     buf[len - 1] = 0; /* be safe ! */
586 
587     return(ret);
588 }
589 
590 /************************************************************************
591  *                                                                      *
592  *              Generic UTF8 handling routines                          *
593  *                                                                      *
594  * From rfc2044: encoding of the Unicode values on UTF-8:               *
595  *                                                                      *
596  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
597  * 0000 0000-0000 007F   0xxxxxxx                                       *
598  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
599  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
600  *                                                                      *
601  * I hope we won't use values > 0xFFFF anytime soon !                   *
602  *                                                                      *
603  ************************************************************************/
604 
605 
606 /**
607  * xmlUTF8Size:
608  * @utf: pointer to the UTF8 character
609  *
610  * calculates the internal size of a UTF8 character
611  *
612  * returns the numbers of bytes in the character, -1 on format error
613  */
614 int
xmlUTF8Size(const xmlChar * utf)615 xmlUTF8Size(const xmlChar *utf) {
616     xmlChar mask;
617     int len;
618 
619     if (utf == NULL)
620         return -1;
621     if (*utf < 0x80)
622         return 1;
623     /* check valid UTF8 character */
624     if (!(*utf & 0x40))
625         return -1;
626     /* determine number of bytes in char */
627     len = 2;
628     for (mask=0x20; mask != 0; mask>>=1) {
629         if (!(*utf & mask))
630             return len;
631         len++;
632     }
633     return -1;
634 }
635 
636 /**
637  * xmlUTF8Charcmp:
638  * @utf1: pointer to first UTF8 char
639  * @utf2: pointer to second UTF8 char
640  *
641  * compares the two UCS4 values
642  *
643  * returns result of the compare as with xmlStrncmp
644  */
645 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)646 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
647 
648     if (utf1 == NULL ) {
649         if (utf2 == NULL)
650             return 0;
651         return -1;
652     }
653     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
654 }
655 
656 /**
657  * xmlUTF8Strlen:
658  * @utf:  a sequence of UTF-8 encoded bytes
659  *
660  * compute the length of an UTF8 string, it doesn't do a full UTF8
661  * checking of the content of the string.
662  *
663  * Returns the number of characters in the string or -1 in case of error
664  */
665 int
xmlUTF8Strlen(const xmlChar * utf)666 xmlUTF8Strlen(const xmlChar *utf) {
667     int ret = 0;
668 
669     if (utf == NULL)
670         return(-1);
671 
672     while (*utf != 0) {
673         if (utf[0] & 0x80) {
674             if ((utf[1] & 0xc0) != 0x80)
675                 return(-1);
676             if ((utf[0] & 0xe0) == 0xe0) {
677                 if ((utf[2] & 0xc0) != 0x80)
678                     return(-1);
679                 if ((utf[0] & 0xf0) == 0xf0) {
680                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
681                         return(-1);
682                     utf += 4;
683                 } else {
684                     utf += 3;
685                 }
686             } else {
687                 utf += 2;
688             }
689         } else {
690             utf++;
691         }
692         ret++;
693     }
694     return(ret);
695 }
696 
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Sequences of one to four
 * bytes are decoded; trailing bytes must look like 10xxxxxx and a four
 * byte lead must be 11110xxx. No check for overlong forms is made.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int lead;
    unsigned int value;
    int avail;

    if ((utf == NULL) || (len == NULL))
        goto fail;
    avail = *len;
    if (avail < 1)
        goto fail;

    lead = utf[0];

    /* 1-byte code */
    if ((lead & 0x80) == 0) {
        *len = 1;
        return((int) lead);
    }

    /* every longer form needs a first trailing byte */
    if (avail < 2)
        goto fail;
    if ((utf[1] & 0xc0) != 0x80)
        goto fail;

    /* 2-byte code */
    if ((lead & 0xe0) != 0xe0) {
        *len = 2;
        value = (lead & 0x1f) << 6;
        value |= utf[1] & 0x3f;
        return((int) value);
    }

    if (avail < 3)
        goto fail;
    if ((utf[2] & 0xc0) != 0x80)
        goto fail;

    /* 3-byte code */
    if ((lead & 0xf0) != 0xf0) {
        *len = 3;
        value = (lead & 0xf) << 12;
        value |= (utf[1] & 0x3f) << 6;
        value |= utf[2] & 0x3f;
        return((int) value);
    }

    /* 4-byte code */
    if (avail < 4)
        goto fail;
    if ((lead & 0xf8) != 0xf0)
        goto fail;
    if ((utf[3] & 0xc0) != 0x80)
        goto fail;
    *len = 4;
    value = (lead & 0x7) << 18;
    value |= (utf[1] & 0x3f) << 12;
    value |= (utf[2] & 0x3f) << 6;
    value |= utf[3] & 0x3f;
    return((int) value);

fail:
    if (len != NULL)
        *len = 0;
    return(-1);
}
766 
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    unsigned char lead;
    int trailing;
    int k;

    if (utf == NULL)
        return(0);

    /*
     * Each character is a lead byte followed by 0 to 3 trailing bytes
     * (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    for (cur = utf; (lead = *cur) != 0; cur += trailing + 1) {
        if ((lead & 0x80) == 0x00)          /* starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0)     /* starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0)     /* starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0)     /* starts with 11110 */
            trailing = 3;
        else                                /* unknown encoding */
            return(0);

        /* stops at the first bad byte, so a 0 is never stepped over */
        for (k = 1; k <= trailing; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return(0);
        }
    }
    return(1);
}
819 
820 /**
821  * xmlUTF8Strsize:
822  * @utf:  a sequence of UTF-8 encoded bytes
823  * @len:  the number of characters in the array
824  *
825  * storage size of an UTF8 string
826  * the behaviour is not guaranteed if the input string is not UTF-8
827  *
828  * Returns the storage size of
829  * the first 'len' characters of ARRAY
830  */
831 
832 int
xmlUTF8Strsize(const xmlChar * utf,int len)833 xmlUTF8Strsize(const xmlChar *utf, int len) {
834     const xmlChar   *ptr=utf;
835     xmlChar         ch;
836 
837     if (utf == NULL)
838         return(0);
839 
840     if (len <= 0)
841         return(0);
842 
843     while ( len-- > 0) {
844         if ( !*ptr )
845             break;
846         if ( (ch = *ptr++) & 0x80) {
847             // Workaround for an optimization bug in VS 2015 Update 2, remove
848             // once the fix is released. crbug.com/599427
849             // https://connect.microsoft.com/VisualStudio/feedback/details/2582138
850             xmlChar ch2 = ch;
851             while ((ch2<<=1) & 0x80 ) {
852                 ptr++;
853                 if (*ptr == 0) break;
854             }
855         }
856     }
857     return (ptr - utf);
858 }
859 
860 /**
861  * xmlUTF8Strndup:
862  * @utf:  the input UTF8 *
863  * @len:  the len of @utf (in chars)
864  *
865  * a strndup for array of UTF8's
866  *
867  * Returns a new UTF8 * or NULL
868  */
869 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)870 xmlUTF8Strndup(const xmlChar *utf, int len) {
871     xmlChar *ret;
872     int i;
873 
874     if ((utf == NULL) || (len < 0)) return(NULL);
875     i = xmlUTF8Strsize(utf, len);
876     ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
877     if (ret == NULL) {
878         xmlGenericError(xmlGenericErrorContext,
879                 "malloc of %ld byte failed\n",
880                 (len + 1) * (long)sizeof(xmlChar));
881         return(NULL);
882     }
883     memcpy(ret, utf, i * sizeof(xmlChar));
884     ret[i] = 0;
885     return(ret);
886 }
887 
888 /**
889  * xmlUTF8Strpos:
890  * @utf:  the input UTF8 *
891  * @pos:  the position of the desired UTF8 char (in chars)
892  *
893  * a function to provide the equivalent of fetching a
894  * character from a string array
895  *
896  * Returns a pointer to the UTF8 character or NULL
897  */
898 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)899 xmlUTF8Strpos(const xmlChar *utf, int pos) {
900     xmlChar ch;
901 
902     if (utf == NULL) return(NULL);
903     if (pos < 0)
904         return(NULL);
905     while (pos--) {
906         if ((ch=*utf++) == 0) return(NULL);
907         if ( ch & 0x80 ) {
908             /* if not simple ascii, verify proper format */
909             if ( (ch & 0xc0) != 0xc0 )
910                 return(NULL);
911             /* then skip over remaining bytes for this char */
912             while ( (ch <<= 1) & 0x80 )
913                 if ( (*utf++ & 0xc0) != 0x80 )
914                     return(NULL);
915         }
916     }
917     return((xmlChar *)utf);
918 }
919 
920 /**
921  * xmlUTF8Strloc:
922  * @utf:  the input UTF8 *
923  * @utfchar:  the UTF8 character to be found
924  *
925  * a function to provide the relative location of a UTF8 char
926  *
927  * Returns the relative character position of the desired char
928  * or -1 if not found
929  */
930 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)931 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
932     int i, size;
933     xmlChar ch;
934 
935     if (utf==NULL || utfchar==NULL) return -1;
936     size = xmlUTF8Strsize(utfchar, 1);
937         for(i=0; (ch=*utf) != 0; i++) {
938             if (xmlStrncmp(utf, utfchar, size)==0)
939                 return(i);
940             utf++;
941             if ( ch & 0x80 ) {
942                 /* if not simple ascii, verify proper format */
943                 if ( (ch & 0xc0) != 0xc0 )
944                     return(-1);
945                 /* then skip over remaining bytes for this char */
946                 while ( (ch <<= 1) & 0x80 )
947                     if ( (*utf++ & 0xc0) != 0x80 )
948                         return(-1);
949             }
950         }
951 
952     return(-1);
953 }
954 /**
955  * xmlUTF8Strsub:
956  * @utf:  a sequence of UTF-8 encoded bytes
957  * @start: relative pos of first char
958  * @len:   total number to copy
959  *
960  * Create a substring from a given UTF-8 string
961  * Note:  positions are given in units of UTF-8 chars
962  *
963  * Returns a pointer to a newly created string
964  * or NULL if any problem
965  */
966 
967 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)968 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
969     int            i;
970     xmlChar ch;
971 
972     if (utf == NULL) return(NULL);
973     if (start < 0) return(NULL);
974     if (len < 0) return(NULL);
975 
976     /*
977      * Skip over any leading chars
978      */
979     for (i = 0;i < start;i++) {
980         if ((ch=*utf++) == 0) return(NULL);
981         if ( ch & 0x80 ) {
982             /* if not simple ascii, verify proper format */
983             if ( (ch & 0xc0) != 0xc0 )
984                 return(NULL);
985             /* then skip over remaining bytes for this char */
986             while ( (ch <<= 1) & 0x80 )
987                 if ( (*utf++ & 0xc0) != 0x80 )
988                     return(NULL);
989         }
990     }
991 
992     return(xmlUTF8Strndup(utf, len));
993 }
994 
995 /**
996  * xmlEscapeFormatString:
997  * @msg:  a pointer to the string in which to escape '%' characters.
998  * Must be a heap-allocated buffer created by libxml2 that may be
999  * returned, or that may be freed and replaced.
1000  *
1001  * Replaces the string pointed to by 'msg' with an escaped string.
1002  * Returns the same string with all '%' characters escaped.
1003  */
1004 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1005 xmlEscapeFormatString(xmlChar **msg)
1006 {
1007     xmlChar *msgPtr = NULL;
1008     xmlChar *result = NULL;
1009     xmlChar *resultPtr = NULL;
1010     size_t count = 0;
1011     size_t msgLen = 0;
1012     size_t resultLen = 0;
1013 
1014     if (!msg || !*msg)
1015         return(NULL);
1016 
1017     for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1018         ++msgLen;
1019         if (*msgPtr == '%')
1020             ++count;
1021     }
1022 
1023     if (count == 0)
1024         return(*msg);
1025 
1026     resultLen = msgLen + count + 1;
1027     result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1028     if (result == NULL) {
1029         /* Clear *msg to prevent format string vulnerabilities in
1030            out-of-memory situations. */
1031         xmlFree(*msg);
1032         *msg = NULL;
1033         xmlErrMemory(NULL, NULL);
1034         return(NULL);
1035     }
1036 
1037     for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1038         *resultPtr = *msgPtr;
1039         if (*msgPtr == '%')
1040             *(++resultPtr) = '%';
1041     }
1042     result[resultLen - 1] = '\0';
1043 
1044     xmlFree(*msg);
1045     *msg = result;
1046 
1047     return *msg;
1048 }
1049 
1050 #define bottom_xmlstring
1051 #include "elfgcchack.h"
1052