1 /* libxml2 - Library for parsing XML documents
2  * Copyright (C) 2006-2019 Free Software Foundation, Inc.
3  *
4  * This file is not part of the GNU gettext program, but is used with
5  * GNU gettext.
6  *
7  * The original copyright notice is as follows:
8  */
9 
10 /*
11  * Copyright (C) 1998-2012 Daniel Veillard.  All Rights Reserved.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this software and associated documentation files (the "Software"), to deal
15  * in the Software without restriction, including without limitation the rights
16  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17  * copies of the Software, and to permit persons to whom the Software is fur-
18  * nished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
25  * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29  * THE SOFTWARE.
30  *
31  * UTF8 string routines from:
32  * William Brack <wbrack@mmm.com.hk>
33  *
34  * daniel@veillard.com
35  */
36 
37 /*
38  * string.c : an XML string utilities module
39  *
40  * This module provides various utility functions for manipulating
41  * the xmlChar* type. All functions named xmlStr* have been moved here
42  * from the parser.c file (their original home).
43  */
44 
45 #define IN_LIBXML
46 #include "libxml.h"
47 
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
53 
54 /************************************************************************
55  *                                                                      *
56  *                Commodity functions to handle xmlChars                *
57  *                                                                      *
58  ************************************************************************/
59 
60 /**
61  * xmlStrndup:
62  * @cur:  the input xmlChar *
63  * @len:  the len of @cur
64  *
65  * a strndup for array of xmlChar's
66  *
67  * Returns a new xmlChar * or NULL
68  */
69 xmlChar *
xmlStrndup(const xmlChar * cur,int len)70 xmlStrndup(const xmlChar *cur, int len) {
71     xmlChar *ret;
72 
73     if ((cur == NULL) || (len < 0)) return(NULL);
74     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
75     if (ret == NULL) {
76         xmlErrMemory(NULL, NULL);
77         return(NULL);
78     }
79     memcpy(ret, cur, len * sizeof(xmlChar));
80     ret[len] = 0;
81     return(ret);
82 }
83 
84 /**
85  * xmlStrdup:
86  * @cur:  the input xmlChar *
87  *
88  * a strdup for array of xmlChar's. Since they are supposed to be
89  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
90  * a termination mark of '0'.
91  *
92  * Returns a new xmlChar * or NULL
93  */
94 xmlChar *
xmlStrdup(const xmlChar * cur)95 xmlStrdup(const xmlChar *cur) {
96     const xmlChar *p = cur;
97 
98     if (cur == NULL) return(NULL);
99     while (*p != 0) p++; /* non input consuming */
100     return(xmlStrndup(cur, p - cur));
101 }
102 
103 /**
104  * xmlCharStrndup:
105  * @cur:  the input char *
106  * @len:  the len of @cur
107  *
108  * a strndup for char's to xmlChar's
109  *
110  * Returns a new xmlChar * or NULL
111  */
112 
113 xmlChar *
xmlCharStrndup(const char * cur,int len)114 xmlCharStrndup(const char *cur, int len) {
115     int i;
116     xmlChar *ret;
117 
118     if ((cur == NULL) || (len < 0)) return(NULL);
119     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
120     if (ret == NULL) {
121         xmlErrMemory(NULL, NULL);
122         return(NULL);
123     }
124     for (i = 0;i < len;i++) {
125         ret[i] = (xmlChar) cur[i];
126         if (ret[i] == 0) return(ret);
127     }
128     ret[len] = 0;
129     return(ret);
130 }
131 
132 /**
133  * xmlCharStrdup:
134  * @cur:  the input char *
135  *
136  * a strdup for char's to xmlChar's
137  *
138  * Returns a new xmlChar * or NULL
139  */
140 
141 xmlChar *
xmlCharStrdup(const char * cur)142 xmlCharStrdup(const char *cur) {
143     const char *p = cur;
144 
145     if (cur == NULL) return(NULL);
146     while (*p != '\0') p++; /* non input consuming */
147     return(xmlCharStrndup(cur, p - cur));
148 }
149 
150 /**
151  * xmlStrcmp:
152  * @str1:  the first xmlChar *
153  * @str2:  the second xmlChar *
154  *
155  * a strcmp for xmlChar's
156  *
157  * Returns the integer result of the comparison
158  */
159 
160 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)161 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
162     register int tmp;
163 
164     if (str1 == str2) return(0);
165     if (str1 == NULL) return(-1);
166     if (str2 == NULL) return(1);
167     do {
168         tmp = *str1++ - *str2;
169         if (tmp != 0) return(tmp);
170     } while (*str2++ != 0);
171     return 0;
172 }
173 
174 /**
175  * xmlStrEqual:
176  * @str1:  the first xmlChar *
177  * @str2:  the second xmlChar *
178  *
179  * Check if both strings are equal of have same content.
180  * Should be a bit more readable and faster than xmlStrcmp()
181  *
182  * Returns 1 if they are equal, 0 if they are different
183  */
184 
185 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)186 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
187     if (str1 == str2) return(1);
188     if (str1 == NULL) return(0);
189     if (str2 == NULL) return(0);
190     do {
191         if (*str1++ != *str2) return(0);
192     } while (*str2++);
193     return(1);
194 }
195 
196 /**
197  * xmlStrQEqual:
198  * @pref:  the prefix of the QName
199  * @name:  the localname of the QName
200  * @str:  the second xmlChar *
201  *
202  * Check if a QName is Equal to a given string
203  *
204  * Returns 1 if they are equal, 0 if they are different
205  */
206 
207 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)208 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
209     if (pref == NULL) return(xmlStrEqual(name, str));
210     if (name == NULL) return(0);
211     if (str == NULL) return(0);
212 
213     do {
214         if (*pref++ != *str) return(0);
215     } while ((*str++) && (*pref));
216     if (*str++ != ':') return(0);
217     do {
218         if (*name++ != *str) return(0);
219     } while (*str++);
220     return(1);
221 }
222 
223 /**
224  * xmlStrncmp:
225  * @str1:  the first xmlChar *
226  * @str2:  the second xmlChar *
227  * @len:  the max comparison length
228  *
229  * a strncmp for xmlChar's
230  *
231  * Returns the integer result of the comparison
232  */
233 
234 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)235 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
236     register int tmp;
237 
238     if (len <= 0) return(0);
239     if (str1 == str2) return(0);
240     if (str1 == NULL) return(-1);
241     if (str2 == NULL) return(1);
242 #ifdef __GNUC__
243     tmp = strncmp((const char *)str1, (const char *)str2, len);
244     return tmp;
245 #else
246     do {
247         tmp = *str1++ - *str2;
248         if (tmp != 0 || --len == 0) return(tmp);
249     } while (*str2++ != 0);
250     return 0;
251 #endif
252 }
253 
254 static const xmlChar casemap[256] = {
255     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
256     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
257     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
258     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
259     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
260     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
261     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
262     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
263     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
264     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
265     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
266     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
267     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
268     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
269     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
270     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
271     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
272     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
273     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
274     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
275     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
276     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
277     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
278     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
279     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
280     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
281     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
282     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
283     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
284     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
285     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
286     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
287 };
288 
289 /**
290  * xmlStrcasecmp:
291  * @str1:  the first xmlChar *
292  * @str2:  the second xmlChar *
293  *
294  * a strcasecmp for xmlChar's
295  *
296  * Returns the integer result of the comparison
297  */
298 
299 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)300 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
301     register int tmp;
302 
303     if (str1 == str2) return(0);
304     if (str1 == NULL) return(-1);
305     if (str2 == NULL) return(1);
306     do {
307         tmp = casemap[*str1++] - casemap[*str2];
308         if (tmp != 0) return(tmp);
309     } while (*str2++ != 0);
310     return 0;
311 }
312 
313 /**
314  * xmlStrncasecmp:
315  * @str1:  the first xmlChar *
316  * @str2:  the second xmlChar *
317  * @len:  the max comparison length
318  *
319  * a strncasecmp for xmlChar's
320  *
321  * Returns the integer result of the comparison
322  */
323 
324 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)325 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
326     register int tmp;
327 
328     if (len <= 0) return(0);
329     if (str1 == str2) return(0);
330     if (str1 == NULL) return(-1);
331     if (str2 == NULL) return(1);
332     do {
333         tmp = casemap[*str1++] - casemap[*str2];
334         if (tmp != 0 || --len == 0) return(tmp);
335     } while (*str2++ != 0);
336     return 0;
337 }
338 
339 /**
340  * xmlStrchr:
341  * @str:  the xmlChar * array
342  * @val:  the xmlChar to search
343  *
344  * a strchr for xmlChar's
345  *
346  * Returns the xmlChar * for the first occurrence or NULL.
347  */
348 
349 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)350 xmlStrchr(const xmlChar *str, xmlChar val) {
351     if (str == NULL) return(NULL);
352     while (*str != 0) { /* non input consuming */
353         if (*str == val) return((xmlChar *) str);
354         str++;
355     }
356     return(NULL);
357 }
358 
359 /**
360  * xmlStrstr:
361  * @str:  the xmlChar * array (haystack)
362  * @val:  the xmlChar to search (needle)
363  *
364  * a strstr for xmlChar's
365  *
366  * Returns the xmlChar * for the first occurrence or NULL.
367  */
368 
369 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)370 xmlStrstr(const xmlChar *str, const xmlChar *val) {
371     int n;
372 
373     if (str == NULL) return(NULL);
374     if (val == NULL) return(NULL);
375     n = xmlStrlen(val);
376 
377     if (n == 0) return(str);
378     while (*str != 0) { /* non input consuming */
379         if (*str == *val) {
380             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
381         }
382         str++;
383     }
384     return(NULL);
385 }
386 
387 /**
388  * xmlStrcasestr:
389  * @str:  the xmlChar * array (haystack)
390  * @val:  the xmlChar to search (needle)
391  *
392  * a case-ignoring strstr for xmlChar's
393  *
394  * Returns the xmlChar * for the first occurrence or NULL.
395  */
396 
397 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)398 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
399     int n;
400 
401     if (str == NULL) return(NULL);
402     if (val == NULL) return(NULL);
403     n = xmlStrlen(val);
404 
405     if (n == 0) return(str);
406     while (*str != 0) { /* non input consuming */
407         if (casemap[*str] == casemap[*val])
408             if (!xmlStrncasecmp(str, val, n)) return(str);
409         str++;
410     }
411     return(NULL);
412 }
413 
414 /**
415  * xmlStrsub:
416  * @str:  the xmlChar * array (haystack)
417  * @start:  the index of the first char (zero based)
418  * @len:  the length of the substring
419  *
420  * Extract a substring of a given string
421  *
422  * Returns the xmlChar * for the first occurrence or NULL.
423  */
424 
425 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)426 xmlStrsub(const xmlChar *str, int start, int len) {
427     int i;
428 
429     if (str == NULL) return(NULL);
430     if (start < 0) return(NULL);
431     if (len < 0) return(NULL);
432 
433     for (i = 0;i < start;i++) {
434         if (*str == 0) return(NULL);
435         str++;
436     }
437     if (*str == 0) return(NULL);
438     return(xmlStrndup(str, len));
439 }
440 
441 /**
442  * xmlStrlen:
443  * @str:  the xmlChar * array
444  *
445  * length of a xmlChar's string
446  *
447  * Returns the number of xmlChar contained in the ARRAY.
448  */
449 
450 int
xmlStrlen(const xmlChar * str)451 xmlStrlen(const xmlChar *str) {
452     int len = 0;
453 
454     if (str == NULL) return(0);
455     while (*str != 0) { /* non input consuming */
456         str++;
457         len++;
458     }
459     return(len);
460 }
461 
462 /**
463  * xmlStrncat:
464  * @cur:  the original xmlChar * array
465  * @add:  the xmlChar * array added
466  * @len:  the length of @add
467  *
468  * a strncat for array of xmlChar's, it will extend @cur with the len
469  * first bytes of @add. Note that if @len < 0 then this is an API error
470  * and NULL will be returned.
471  *
472  * Returns a new xmlChar *, the original @cur is reallocated and should
473  * not be freed.
474  */
475 
476 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)477 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
478     int size;
479     xmlChar *ret;
480 
481     if ((add == NULL) || (len == 0))
482         return(cur);
483     if (len < 0)
484 	return(NULL);
485     if (cur == NULL)
486         return(xmlStrndup(add, len));
487 
488     size = xmlStrlen(cur);
489     if (size < 0)
490         return(NULL);
491     ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
492     if (ret == NULL) {
493         xmlErrMemory(NULL, NULL);
494         return(cur);
495     }
496     memcpy(&ret[size], add, len * sizeof(xmlChar));
497     ret[size + len] = 0;
498     return(ret);
499 }
500 
501 /**
502  * xmlStrncatNew:
503  * @str1:  first xmlChar string
504  * @str2:  second xmlChar string
505  * @len:  the len of @str2 or < 0
506  *
507  * same as xmlStrncat, but creates a new string.  The original
508  * two strings are not freed. If @len is < 0 then the length
509  * will be calculated automatically.
510  *
511  * Returns a new xmlChar * or NULL
512  */
513 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)514 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
515     int size;
516     xmlChar *ret;
517 
518     if (len < 0) {
519         len = xmlStrlen(str2);
520         if (len < 0)
521             return(NULL);
522     }
523     if ((str2 == NULL) || (len == 0))
524         return(xmlStrdup(str1));
525     if (str1 == NULL)
526         return(xmlStrndup(str2, len));
527 
528     size = xmlStrlen(str1);
529     if (size < 0)
530         return(NULL);
531     ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
532     if (ret == NULL) {
533         xmlErrMemory(NULL, NULL);
534         return(xmlStrndup(str1, size));
535     }
536     memcpy(ret, str1, size * sizeof(xmlChar));
537     memcpy(&ret[size], str2, len * sizeof(xmlChar));
538     ret[size + len] = 0;
539     return(ret);
540 }
541 
542 /**
543  * xmlStrcat:
544  * @cur:  the original xmlChar * array
545  * @add:  the xmlChar * array added
546  *
547  * a strcat for array of xmlChar's. Since they are supposed to be
548  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
549  * a termination mark of '0'.
550  *
551  * Returns a new xmlChar * containing the concatenated string. The original
552  * @cur is reallocated and should not be freed.
553  */
554 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)555 xmlStrcat(xmlChar *cur, const xmlChar *add) {
556     const xmlChar *p = add;
557 
558     if (add == NULL) return(cur);
559     if (cur == NULL)
560         return(xmlStrdup(add));
561 
562     while (*p != 0) p++; /* non input consuming */
563     return(xmlStrncat(cur, add, p - add));
564 }
565 
566 /**
567  * xmlStrPrintf:
568  * @buf:   the result buffer.
569  * @len:   the result buffer length.
570  * @msg:   the message with printf formatting.
571  * @...:   extra parameters for the message.
572  *
573  * Formats @msg and places result into @buf.
574  *
575  * Returns the number of characters written to @buf or -1 if an error occurs.
576  */
577 int XMLCDECL
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)578 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
579     va_list args;
580     int ret;
581 
582     if((buf == NULL) || (msg == NULL)) {
583         return(-1);
584     }
585 
586     va_start(args, msg);
587     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
588     va_end(args);
589     buf[len - 1] = 0; /* be safe ! */
590 
591     return(ret);
592 }
593 
594 /**
595  * xmlStrVPrintf:
596  * @buf:   the result buffer.
597  * @len:   the result buffer length.
598  * @msg:   the message with printf formatting.
599  * @ap:    extra parameters for the message.
600  *
601  * Formats @msg and places result into @buf.
602  *
603  * Returns the number of characters written to @buf or -1 if an error occurs.
604  */
605 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)606 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
607     int ret;
608 
609     if((buf == NULL) || (msg == NULL)) {
610         return(-1);
611     }
612 
613     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
614     buf[len - 1] = 0; /* be safe ! */
615 
616     return(ret);
617 }
618 
619 /************************************************************************
620  *                                                                      *
621  *              Generic UTF8 handling routines                          *
622  *                                                                      *
623  * From rfc2044: encoding of the Unicode values on UTF-8:               *
624  *                                                                      *
625  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
626  * 0000 0000-0000 007F   0xxxxxxx                                       *
627  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
628  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
629  *                                                                      *
630  * I hope we won't use values > 0xFFFF anytime soon !                   *
631  *                                                                      *
632  ************************************************************************/
633 
634 
635 /**
636  * xmlUTF8Size:
637  * @utf: pointer to the UTF8 character
638  *
639  * calculates the internal size of a UTF8 character
640  *
641  * returns the numbers of bytes in the character, -1 on format error
642  */
643 int
xmlUTF8Size(const xmlChar * utf)644 xmlUTF8Size(const xmlChar *utf) {
645     xmlChar mask;
646     int len;
647 
648     if (utf == NULL)
649         return -1;
650     if (*utf < 0x80)
651         return 1;
652     /* check valid UTF8 character */
653     if (!(*utf & 0x40))
654         return -1;
655     /* determine number of bytes in char */
656     len = 2;
657     for (mask=0x20; mask != 0; mask>>=1) {
658         if (!(*utf & mask))
659             return len;
660         len++;
661     }
662     return -1;
663 }
664 
665 /**
666  * xmlUTF8Charcmp:
667  * @utf1: pointer to first UTF8 char
668  * @utf2: pointer to second UTF8 char
669  *
670  * compares the two UCS4 values
671  *
672  * returns result of the compare as with xmlStrncmp
673  */
674 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)675 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
676 
677     if (utf1 == NULL ) {
678         if (utf2 == NULL)
679             return 0;
680         return -1;
681     }
682     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
683 }
684 
685 /**
686  * xmlUTF8Strlen:
687  * @utf:  a sequence of UTF-8 encoded bytes
688  *
689  * compute the length of an UTF8 string, it doesn't do a full UTF8
690  * checking of the content of the string.
691  *
692  * Returns the number of characters in the string or -1 in case of error
693  */
694 int
xmlUTF8Strlen(const xmlChar * utf)695 xmlUTF8Strlen(const xmlChar *utf) {
696     int ret = 0;
697 
698     if (utf == NULL)
699         return(-1);
700 
701     while (*utf != 0) {
702         if (utf[0] & 0x80) {
703             if ((utf[1] & 0xc0) != 0x80)
704                 return(-1);
705             if ((utf[0] & 0xe0) == 0xe0) {
706                 if ((utf[2] & 0xc0) != 0x80)
707                     return(-1);
708                 if ((utf[0] & 0xf0) == 0xf0) {
709                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
710                         return(-1);
711                     utf += 4;
712                 } else {
713                     utf += 3;
714                 }
715             } else {
716                 utf += 2;
717             }
718         } else {
719             utf++;
720         }
721         ret++;
722     }
723     return(ret);
724 }
725 
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. The first byte must be an
 * ASCII byte or a 2-, 3- or 4-byte lead (110xxxxx, 1110xxxx, 11110xxx)
 * and every following byte must be a continuation byte (10xxxxxx).
 * Overlong encodings and the 0x10FFFF upper bound are not checked.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* 10xxxxxx is a continuation byte and cannot start a char */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* leads announcing more than 4 bytes are refused */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
795 
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    unsigned char lead;
    int trailing;
    int k;

    if (utf == NULL)
        return(0);
    /*
     * Each character is 1, 2, 3 or 4 bytes long.  The valid layouts
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    for (cur = utf; (lead = *cur) != 0; cur += trailing + 1) {
        /* the lead byte tells how many continuation bytes follow */
        if ((lead & 0x80) == 0x00)
            trailing = 0;               /* starts with 0 */
        else if ((lead & 0xe0) == 0xc0)
            trailing = 1;               /* starts with 110 */
        else if ((lead & 0xf0) == 0xe0)
            trailing = 2;               /* starts with 1110 */
        else if ((lead & 0xf8) == 0xf0)
            trailing = 3;               /* starts with 11110 */
        else
            return(0);                  /* unknown encoding */

        /* a terminating 0 fails this test too, so no read goes past it */
        for (k = 1; k <= trailing; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return(0);
        }
    }
    return(1);
}
848 
849 /**
850  * xmlUTF8Strsize:
851  * @utf:  a sequence of UTF-8 encoded bytes
852  * @len:  the number of characters in the array
853  *
854  * storage size of an UTF8 string
855  * the behaviour is not guaranteed if the input string is not UTF-8
856  *
857  * Returns the storage size of
858  * the first 'len' characters of ARRAY
859  */
860 
861 int
xmlUTF8Strsize(const xmlChar * utf,int len)862 xmlUTF8Strsize(const xmlChar *utf, int len) {
863     const xmlChar   *ptr=utf;
864     xmlChar         ch;
865 
866     if (utf == NULL)
867         return(0);
868 
869     if (len <= 0)
870         return(0);
871 
872     while ( len-- > 0) {
873         if ( !*ptr )
874             break;
875         if ( (ch = *ptr++) & 0x80)
876             while ((ch<<=1) & 0x80 ) {
877 		if (*ptr == 0) break;
878                 ptr++;
879 	    }
880     }
881     return (ptr - utf);
882 }
883 
884 
885 /**
886  * xmlUTF8Strndup:
887  * @utf:  the input UTF8 *
888  * @len:  the len of @utf (in chars)
889  *
890  * a strndup for array of UTF8's
891  *
892  * Returns a new UTF8 * or NULL
893  */
894 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)895 xmlUTF8Strndup(const xmlChar *utf, int len) {
896     xmlChar *ret;
897     int i;
898 
899     if ((utf == NULL) || (len < 0)) return(NULL);
900     i = xmlUTF8Strsize(utf, len);
901     ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
902     if (ret == NULL) {
903         xmlGenericError(xmlGenericErrorContext,
904                 "malloc of %ld byte failed\n",
905                 (len + 1) * (long)sizeof(xmlChar));
906         return(NULL);
907     }
908     memcpy(ret, utf, i * sizeof(xmlChar));
909     ret[i] = 0;
910     return(ret);
911 }
912 
913 /**
914  * xmlUTF8Strpos:
915  * @utf:  the input UTF8 *
916  * @pos:  the position of the desired UTF8 char (in chars)
917  *
918  * a function to provide the equivalent of fetching a
919  * character from a string array
920  *
921  * Returns a pointer to the UTF8 character or NULL
922  */
923 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)924 xmlUTF8Strpos(const xmlChar *utf, int pos) {
925     xmlChar ch;
926 
927     if (utf == NULL) return(NULL);
928     if (pos < 0)
929         return(NULL);
930     while (pos--) {
931         if ((ch=*utf++) == 0) return(NULL);
932         if ( ch & 0x80 ) {
933             /* if not simple ascii, verify proper format */
934             if ( (ch & 0xc0) != 0xc0 )
935                 return(NULL);
936             /* then skip over remaining bytes for this char */
937             while ( (ch <<= 1) & 0x80 )
938                 if ( (*utf++ & 0xc0) != 0x80 )
939                     return(NULL);
940         }
941     }
942     return((xmlChar *)utf);
943 }
944 
945 /**
946  * xmlUTF8Strloc:
947  * @utf:  the input UTF8 *
948  * @utfchar:  the UTF8 character to be found
949  *
950  * a function to provide the relative location of a UTF8 char
951  *
952  * Returns the relative character position of the desired char
953  * or -1 if not found
954  */
955 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)956 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
957     int i, size;
958     xmlChar ch;
959 
960     if (utf==NULL || utfchar==NULL) return -1;
961     size = xmlUTF8Strsize(utfchar, 1);
962         for(i=0; (ch=*utf) != 0; i++) {
963             if (xmlStrncmp(utf, utfchar, size)==0)
964                 return(i);
965             utf++;
966             if ( ch & 0x80 ) {
967                 /* if not simple ascii, verify proper format */
968                 if ( (ch & 0xc0) != 0xc0 )
969                     return(-1);
970                 /* then skip over remaining bytes for this char */
971                 while ( (ch <<= 1) & 0x80 )
972                     if ( (*utf++ & 0xc0) != 0x80 )
973                         return(-1);
974             }
975         }
976 
977     return(-1);
978 }
979 /**
980  * xmlUTF8Strsub:
981  * @utf:  a sequence of UTF-8 encoded bytes
982  * @start: relative pos of first char
983  * @len:   total number to copy
984  *
985  * Create a substring from a given UTF-8 string
986  * Note:  positions are given in units of UTF-8 chars
987  *
988  * Returns a pointer to a newly created string
989  * or NULL if any problem
990  */
991 
992 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)993 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
994     int            i;
995     xmlChar ch;
996 
997     if (utf == NULL) return(NULL);
998     if (start < 0) return(NULL);
999     if (len < 0) return(NULL);
1000 
1001     /*
1002      * Skip over any leading chars
1003      */
1004     for (i = 0;i < start;i++) {
1005         if ((ch=*utf++) == 0) return(NULL);
1006         if ( ch & 0x80 ) {
1007             /* if not simple ascii, verify proper format */
1008             if ( (ch & 0xc0) != 0xc0 )
1009                 return(NULL);
1010             /* then skip over remaining bytes for this char */
1011             while ( (ch <<= 1) & 0x80 )
1012                 if ( (*utf++ & 0xc0) != 0x80 )
1013                     return(NULL);
1014         }
1015     }
1016 
1017     return(xmlUTF8Strndup(utf, len));
1018 }
1019 
1020 /**
1021  * xmlEscapeFormatString:
1022  * @msg:  a pointer to the string in which to escape '%' characters.
1023  * Must be a heap-allocated buffer created by libxml2 that may be
1024  * returned, or that may be freed and replaced.
1025  *
1026  * Replaces the string pointed to by 'msg' with an escaped string.
1027  * Returns the same string with all '%' characters escaped.
1028  */
1029 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1030 xmlEscapeFormatString(xmlChar **msg)
1031 {
1032     xmlChar *msgPtr = NULL;
1033     xmlChar *result = NULL;
1034     xmlChar *resultPtr = NULL;
1035     size_t count = 0;
1036     size_t msgLen = 0;
1037     size_t resultLen = 0;
1038 
1039     if (!msg || !*msg)
1040         return(NULL);
1041 
1042     for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1043         ++msgLen;
1044         if (*msgPtr == '%')
1045             ++count;
1046     }
1047 
1048     if (count == 0)
1049         return(*msg);
1050 
1051     resultLen = msgLen + count + 1;
1052     result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1053     if (result == NULL) {
1054         /* Clear *msg to prevent format string vulnerabilities in
1055            out-of-memory situations. */
1056         xmlFree(*msg);
1057         *msg = NULL;
1058         xmlErrMemory(NULL, NULL);
1059         return(NULL);
1060     }
1061 
1062     for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1063         *resultPtr = *msgPtr;
1064         if (*msgPtr == '%')
1065             *(++resultPtr) = '%';
1066     }
1067     result[resultLen - 1] = '\0';
1068 
1069     xmlFree(*msg);
1070     *msg = result;
1071 
1072     return *msg;
1073 }
1074 
1075 #define bottom_xmlstring
1076 #include "elfgcchack.h"
1077