1 /* libxml2 - Library for parsing XML documents
2 * Copyright (C) 2006-2019 Free Software Foundation, Inc.
3 *
4 * This file is not part of the GNU gettext program, but is used with
5 * GNU gettext.
6 *
7 * The original copyright notice is as follows:
8 */
9
10 /*
11 * Copyright (C) 1998-2012 Daniel Veillard. All Rights Reserved.
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining a copy
14 * of this software and associated documentation files (the "Software"), to deal
15 * in the Software without restriction, including without limitation the rights
16 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 * copies of the Software, and to permit persons to whom the Software is fur-
18 * nished to do so, subject to the following conditions:
19 *
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
25 * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 * THE SOFTWARE.
30 *
31 * UTF8 string routines from:
32 * William Brack <wbrack@mmm.com.hk>
33 *
34 * daniel@veillard.com
35 */
36
37 /*
38 * string.c : an XML string utilities module
39 *
40 * This module provides various utility functions for manipulating
41 * the xmlChar* type. All functions named xmlStr* have been moved here
42 * from the parser.c file (their original home).
43 */
44
45 #define IN_LIBXML
46 #include "libxml.h"
47
48 #include <stdlib.h>
49 #include <string.h>
50 #include <libxml/xmlmemory.h>
51 #include <libxml/parserInternals.h>
52 #include <libxml/xmlstring.h>
53
54 /************************************************************************
55 * *
56 * Commodity functions to handle xmlChars *
57 * *
58 ************************************************************************/
59
60 /**
61 * xmlStrndup:
62 * @cur: the input xmlChar *
63 * @len: the len of @cur
64 *
65 * a strndup for array of xmlChar's
66 *
67 * Returns a new xmlChar * or NULL
68 */
69 xmlChar *
xmlStrndup(const xmlChar * cur,int len)70 xmlStrndup(const xmlChar *cur, int len) {
71 xmlChar *ret;
72
73 if ((cur == NULL) || (len < 0)) return(NULL);
74 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
75 if (ret == NULL) {
76 xmlErrMemory(NULL, NULL);
77 return(NULL);
78 }
79 memcpy(ret, cur, len * sizeof(xmlChar));
80 ret[len] = 0;
81 return(ret);
82 }
83
84 /**
85 * xmlStrdup:
86 * @cur: the input xmlChar *
87 *
88 * a strdup for array of xmlChar's. Since they are supposed to be
89 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
90 * a termination mark of '0'.
91 *
92 * Returns a new xmlChar * or NULL
93 */
94 xmlChar *
xmlStrdup(const xmlChar * cur)95 xmlStrdup(const xmlChar *cur) {
96 const xmlChar *p = cur;
97
98 if (cur == NULL) return(NULL);
99 while (*p != 0) p++; /* non input consuming */
100 return(xmlStrndup(cur, p - cur));
101 }
102
103 /**
104 * xmlCharStrndup:
105 * @cur: the input char *
106 * @len: the len of @cur
107 *
108 * a strndup for char's to xmlChar's
109 *
110 * Returns a new xmlChar * or NULL
111 */
112
113 xmlChar *
xmlCharStrndup(const char * cur,int len)114 xmlCharStrndup(const char *cur, int len) {
115 int i;
116 xmlChar *ret;
117
118 if ((cur == NULL) || (len < 0)) return(NULL);
119 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
120 if (ret == NULL) {
121 xmlErrMemory(NULL, NULL);
122 return(NULL);
123 }
124 for (i = 0;i < len;i++) {
125 ret[i] = (xmlChar) cur[i];
126 if (ret[i] == 0) return(ret);
127 }
128 ret[len] = 0;
129 return(ret);
130 }
131
132 /**
133 * xmlCharStrdup:
134 * @cur: the input char *
135 *
136 * a strdup for char's to xmlChar's
137 *
138 * Returns a new xmlChar * or NULL
139 */
140
141 xmlChar *
xmlCharStrdup(const char * cur)142 xmlCharStrdup(const char *cur) {
143 const char *p = cur;
144
145 if (cur == NULL) return(NULL);
146 while (*p != '\0') p++; /* non input consuming */
147 return(xmlCharStrndup(cur, p - cur));
148 }
149
150 /**
151 * xmlStrcmp:
152 * @str1: the first xmlChar *
153 * @str2: the second xmlChar *
154 *
155 * a strcmp for xmlChar's
156 *
157 * Returns the integer result of the comparison
158 */
159
160 int
xmlStrcmp(const xmlChar * str1,const xmlChar * str2)161 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
162 register int tmp;
163
164 if (str1 == str2) return(0);
165 if (str1 == NULL) return(-1);
166 if (str2 == NULL) return(1);
167 do {
168 tmp = *str1++ - *str2;
169 if (tmp != 0) return(tmp);
170 } while (*str2++ != 0);
171 return 0;
172 }
173
174 /**
175 * xmlStrEqual:
176 * @str1: the first xmlChar *
177 * @str2: the second xmlChar *
178 *
179 * Check if both strings are equal of have same content.
180 * Should be a bit more readable and faster than xmlStrcmp()
181 *
182 * Returns 1 if they are equal, 0 if they are different
183 */
184
185 int
xmlStrEqual(const xmlChar * str1,const xmlChar * str2)186 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
187 if (str1 == str2) return(1);
188 if (str1 == NULL) return(0);
189 if (str2 == NULL) return(0);
190 do {
191 if (*str1++ != *str2) return(0);
192 } while (*str2++);
193 return(1);
194 }
195
196 /**
197 * xmlStrQEqual:
198 * @pref: the prefix of the QName
199 * @name: the localname of the QName
200 * @str: the second xmlChar *
201 *
202 * Check if a QName is Equal to a given string
203 *
204 * Returns 1 if they are equal, 0 if they are different
205 */
206
207 int
xmlStrQEqual(const xmlChar * pref,const xmlChar * name,const xmlChar * str)208 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
209 if (pref == NULL) return(xmlStrEqual(name, str));
210 if (name == NULL) return(0);
211 if (str == NULL) return(0);
212
213 do {
214 if (*pref++ != *str) return(0);
215 } while ((*str++) && (*pref));
216 if (*str++ != ':') return(0);
217 do {
218 if (*name++ != *str) return(0);
219 } while (*str++);
220 return(1);
221 }
222
223 /**
224 * xmlStrncmp:
225 * @str1: the first xmlChar *
226 * @str2: the second xmlChar *
227 * @len: the max comparison length
228 *
229 * a strncmp for xmlChar's
230 *
231 * Returns the integer result of the comparison
232 */
233
234 int
xmlStrncmp(const xmlChar * str1,const xmlChar * str2,int len)235 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
236 register int tmp;
237
238 if (len <= 0) return(0);
239 if (str1 == str2) return(0);
240 if (str1 == NULL) return(-1);
241 if (str2 == NULL) return(1);
242 #ifdef __GNUC__
243 tmp = strncmp((const char *)str1, (const char *)str2, len);
244 return tmp;
245 #else
246 do {
247 tmp = *str1++ - *str2;
248 if (tmp != 0 || --len == 0) return(tmp);
249 } while (*str2++ != 0);
250 return 0;
251 #endif
252 }
253
254 static const xmlChar casemap[256] = {
255 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
256 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
257 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
258 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
259 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
260 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
261 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
262 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
263 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
264 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
265 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
266 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
267 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
268 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
269 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
270 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
271 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
272 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
273 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
274 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
275 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
276 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
277 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
278 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
279 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
280 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
281 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
282 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
283 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
284 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
285 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
286 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
287 };
288
289 /**
290 * xmlStrcasecmp:
291 * @str1: the first xmlChar *
292 * @str2: the second xmlChar *
293 *
294 * a strcasecmp for xmlChar's
295 *
296 * Returns the integer result of the comparison
297 */
298
299 int
xmlStrcasecmp(const xmlChar * str1,const xmlChar * str2)300 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
301 register int tmp;
302
303 if (str1 == str2) return(0);
304 if (str1 == NULL) return(-1);
305 if (str2 == NULL) return(1);
306 do {
307 tmp = casemap[*str1++] - casemap[*str2];
308 if (tmp != 0) return(tmp);
309 } while (*str2++ != 0);
310 return 0;
311 }
312
313 /**
314 * xmlStrncasecmp:
315 * @str1: the first xmlChar *
316 * @str2: the second xmlChar *
317 * @len: the max comparison length
318 *
319 * a strncasecmp for xmlChar's
320 *
321 * Returns the integer result of the comparison
322 */
323
324 int
xmlStrncasecmp(const xmlChar * str1,const xmlChar * str2,int len)325 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
326 register int tmp;
327
328 if (len <= 0) return(0);
329 if (str1 == str2) return(0);
330 if (str1 == NULL) return(-1);
331 if (str2 == NULL) return(1);
332 do {
333 tmp = casemap[*str1++] - casemap[*str2];
334 if (tmp != 0 || --len == 0) return(tmp);
335 } while (*str2++ != 0);
336 return 0;
337 }
338
339 /**
340 * xmlStrchr:
341 * @str: the xmlChar * array
342 * @val: the xmlChar to search
343 *
344 * a strchr for xmlChar's
345 *
346 * Returns the xmlChar * for the first occurrence or NULL.
347 */
348
349 const xmlChar *
xmlStrchr(const xmlChar * str,xmlChar val)350 xmlStrchr(const xmlChar *str, xmlChar val) {
351 if (str == NULL) return(NULL);
352 while (*str != 0) { /* non input consuming */
353 if (*str == val) return((xmlChar *) str);
354 str++;
355 }
356 return(NULL);
357 }
358
359 /**
360 * xmlStrstr:
361 * @str: the xmlChar * array (haystack)
362 * @val: the xmlChar to search (needle)
363 *
364 * a strstr for xmlChar's
365 *
366 * Returns the xmlChar * for the first occurrence or NULL.
367 */
368
369 const xmlChar *
xmlStrstr(const xmlChar * str,const xmlChar * val)370 xmlStrstr(const xmlChar *str, const xmlChar *val) {
371 int n;
372
373 if (str == NULL) return(NULL);
374 if (val == NULL) return(NULL);
375 n = xmlStrlen(val);
376
377 if (n == 0) return(str);
378 while (*str != 0) { /* non input consuming */
379 if (*str == *val) {
380 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
381 }
382 str++;
383 }
384 return(NULL);
385 }
386
387 /**
388 * xmlStrcasestr:
389 * @str: the xmlChar * array (haystack)
390 * @val: the xmlChar to search (needle)
391 *
392 * a case-ignoring strstr for xmlChar's
393 *
394 * Returns the xmlChar * for the first occurrence or NULL.
395 */
396
397 const xmlChar *
xmlStrcasestr(const xmlChar * str,const xmlChar * val)398 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
399 int n;
400
401 if (str == NULL) return(NULL);
402 if (val == NULL) return(NULL);
403 n = xmlStrlen(val);
404
405 if (n == 0) return(str);
406 while (*str != 0) { /* non input consuming */
407 if (casemap[*str] == casemap[*val])
408 if (!xmlStrncasecmp(str, val, n)) return(str);
409 str++;
410 }
411 return(NULL);
412 }
413
414 /**
415 * xmlStrsub:
416 * @str: the xmlChar * array (haystack)
417 * @start: the index of the first char (zero based)
418 * @len: the length of the substring
419 *
420 * Extract a substring of a given string
421 *
422 * Returns the xmlChar * for the first occurrence or NULL.
423 */
424
425 xmlChar *
xmlStrsub(const xmlChar * str,int start,int len)426 xmlStrsub(const xmlChar *str, int start, int len) {
427 int i;
428
429 if (str == NULL) return(NULL);
430 if (start < 0) return(NULL);
431 if (len < 0) return(NULL);
432
433 for (i = 0;i < start;i++) {
434 if (*str == 0) return(NULL);
435 str++;
436 }
437 if (*str == 0) return(NULL);
438 return(xmlStrndup(str, len));
439 }
440
441 /**
442 * xmlStrlen:
443 * @str: the xmlChar * array
444 *
445 * length of a xmlChar's string
446 *
447 * Returns the number of xmlChar contained in the ARRAY.
448 */
449
450 int
xmlStrlen(const xmlChar * str)451 xmlStrlen(const xmlChar *str) {
452 int len = 0;
453
454 if (str == NULL) return(0);
455 while (*str != 0) { /* non input consuming */
456 str++;
457 len++;
458 }
459 return(len);
460 }
461
462 /**
463 * xmlStrncat:
464 * @cur: the original xmlChar * array
465 * @add: the xmlChar * array added
466 * @len: the length of @add
467 *
468 * a strncat for array of xmlChar's, it will extend @cur with the len
469 * first bytes of @add. Note that if @len < 0 then this is an API error
470 * and NULL will be returned.
471 *
472 * Returns a new xmlChar *, the original @cur is reallocated and should
473 * not be freed.
474 */
475
476 xmlChar *
xmlStrncat(xmlChar * cur,const xmlChar * add,int len)477 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
478 int size;
479 xmlChar *ret;
480
481 if ((add == NULL) || (len == 0))
482 return(cur);
483 if (len < 0)
484 return(NULL);
485 if (cur == NULL)
486 return(xmlStrndup(add, len));
487
488 size = xmlStrlen(cur);
489 if (size < 0)
490 return(NULL);
491 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
492 if (ret == NULL) {
493 xmlErrMemory(NULL, NULL);
494 return(cur);
495 }
496 memcpy(&ret[size], add, len * sizeof(xmlChar));
497 ret[size + len] = 0;
498 return(ret);
499 }
500
501 /**
502 * xmlStrncatNew:
503 * @str1: first xmlChar string
504 * @str2: second xmlChar string
505 * @len: the len of @str2 or < 0
506 *
507 * same as xmlStrncat, but creates a new string. The original
508 * two strings are not freed. If @len is < 0 then the length
509 * will be calculated automatically.
510 *
511 * Returns a new xmlChar * or NULL
512 */
513 xmlChar *
xmlStrncatNew(const xmlChar * str1,const xmlChar * str2,int len)514 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
515 int size;
516 xmlChar *ret;
517
518 if (len < 0) {
519 len = xmlStrlen(str2);
520 if (len < 0)
521 return(NULL);
522 }
523 if ((str2 == NULL) || (len == 0))
524 return(xmlStrdup(str1));
525 if (str1 == NULL)
526 return(xmlStrndup(str2, len));
527
528 size = xmlStrlen(str1);
529 if (size < 0)
530 return(NULL);
531 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
532 if (ret == NULL) {
533 xmlErrMemory(NULL, NULL);
534 return(xmlStrndup(str1, size));
535 }
536 memcpy(ret, str1, size * sizeof(xmlChar));
537 memcpy(&ret[size], str2, len * sizeof(xmlChar));
538 ret[size + len] = 0;
539 return(ret);
540 }
541
542 /**
543 * xmlStrcat:
544 * @cur: the original xmlChar * array
545 * @add: the xmlChar * array added
546 *
547 * a strcat for array of xmlChar's. Since they are supposed to be
548 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
549 * a termination mark of '0'.
550 *
551 * Returns a new xmlChar * containing the concatenated string. The original
552 * @cur is reallocated and should not be freed.
553 */
554 xmlChar *
xmlStrcat(xmlChar * cur,const xmlChar * add)555 xmlStrcat(xmlChar *cur, const xmlChar *add) {
556 const xmlChar *p = add;
557
558 if (add == NULL) return(cur);
559 if (cur == NULL)
560 return(xmlStrdup(add));
561
562 while (*p != 0) p++; /* non input consuming */
563 return(xmlStrncat(cur, add, p - add));
564 }
565
566 /**
567 * xmlStrPrintf:
568 * @buf: the result buffer.
569 * @len: the result buffer length.
570 * @msg: the message with printf formatting.
571 * @...: extra parameters for the message.
572 *
573 * Formats @msg and places result into @buf.
574 *
575 * Returns the number of characters written to @buf or -1 if an error occurs.
576 */
577 int XMLCDECL
xmlStrPrintf(xmlChar * buf,int len,const char * msg,...)578 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
579 va_list args;
580 int ret;
581
582 if((buf == NULL) || (msg == NULL)) {
583 return(-1);
584 }
585
586 va_start(args, msg);
587 ret = vsnprintf((char *) buf, len, (const char *) msg, args);
588 va_end(args);
589 buf[len - 1] = 0; /* be safe ! */
590
591 return(ret);
592 }
593
594 /**
595 * xmlStrVPrintf:
596 * @buf: the result buffer.
597 * @len: the result buffer length.
598 * @msg: the message with printf formatting.
599 * @ap: extra parameters for the message.
600 *
601 * Formats @msg and places result into @buf.
602 *
603 * Returns the number of characters written to @buf or -1 if an error occurs.
604 */
605 int
xmlStrVPrintf(xmlChar * buf,int len,const char * msg,va_list ap)606 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
607 int ret;
608
609 if((buf == NULL) || (msg == NULL)) {
610 return(-1);
611 }
612
613 ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
614 buf[len - 1] = 0; /* be safe ! */
615
616 return(ret);
617 }
618
/************************************************************************
 *                                                                      *
 *               Generic UTF8 handling routines                         *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)     UTF-8 octet sequence (binary)                 *
 * 0000 0000-0000 007F    0xxxxxxx                                      *
 * 0000 0080-0000 07FF    110xxxxx 10xxxxxx                             *
 * 0000 0800-0000 FFFF    1110xxxx 10xxxxxx 10xxxxxx                    *
 * 0001 0000-001F FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 * (4-byte sequences, i.e. values > 0xFFFF, are handled below.)         *
 *                                                                      *
 ************************************************************************/
633
634
635 /**
636 * xmlUTF8Size:
637 * @utf: pointer to the UTF8 character
638 *
639 * calculates the internal size of a UTF8 character
640 *
641 * returns the numbers of bytes in the character, -1 on format error
642 */
643 int
xmlUTF8Size(const xmlChar * utf)644 xmlUTF8Size(const xmlChar *utf) {
645 xmlChar mask;
646 int len;
647
648 if (utf == NULL)
649 return -1;
650 if (*utf < 0x80)
651 return 1;
652 /* check valid UTF8 character */
653 if (!(*utf & 0x40))
654 return -1;
655 /* determine number of bytes in char */
656 len = 2;
657 for (mask=0x20; mask != 0; mask>>=1) {
658 if (!(*utf & mask))
659 return len;
660 len++;
661 }
662 return -1;
663 }
664
665 /**
666 * xmlUTF8Charcmp:
667 * @utf1: pointer to first UTF8 char
668 * @utf2: pointer to second UTF8 char
669 *
670 * compares the two UCS4 values
671 *
672 * returns result of the compare as with xmlStrncmp
673 */
674 int
xmlUTF8Charcmp(const xmlChar * utf1,const xmlChar * utf2)675 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
676
677 if (utf1 == NULL ) {
678 if (utf2 == NULL)
679 return 0;
680 return -1;
681 }
682 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
683 }
684
685 /**
686 * xmlUTF8Strlen:
687 * @utf: a sequence of UTF-8 encoded bytes
688 *
689 * compute the length of an UTF8 string, it doesn't do a full UTF8
690 * checking of the content of the string.
691 *
692 * Returns the number of characters in the string or -1 in case of error
693 */
694 int
xmlUTF8Strlen(const xmlChar * utf)695 xmlUTF8Strlen(const xmlChar *utf) {
696 int ret = 0;
697
698 if (utf == NULL)
699 return(-1);
700
701 while (*utf != 0) {
702 if (utf[0] & 0x80) {
703 if ((utf[1] & 0xc0) != 0x80)
704 return(-1);
705 if ((utf[0] & 0xe0) == 0xe0) {
706 if ((utf[2] & 0xc0) != 0x80)
707 return(-1);
708 if ((utf[0] & 0xf0) == 0xf0) {
709 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
710 return(-1);
711 utf += 4;
712 } else {
713 utf += 3;
714 }
715 } else {
716 utf += 2;
717 }
718 } else {
719 utf++;
720 }
721 ret++;
722 }
723 return(ret);
724 }
725
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. The lead byte must be a
 * valid start of a 1 to 4 byte sequence and must be followed by the
 * matching number of continuation bytes (10xxxxxx). A continuation
 * byte in lead position is rejected. The decoded value itself is not
 * range checked.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *         the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /*
         * 10xxxxxx can only continue a sequence, never start one.
         * Without this test such a byte was decoded as the lead of a
         * 2-byte sequence.
         */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 11111xxx would announce more than 4 bytes */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
795
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    unsigned char lead;

    if (utf == NULL)
        return(0);

    /*
     * Each character is 1, 2, 3 or 4 bytes long. The valid forms are
     * (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cur = utf;
    while ((lead = *cur) != 0) {        /* string is 0-terminated */
        int trailing;
        int k;

        /* the lead byte tells how many continuation bytes follow */
        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trailing = 3;
        else                            /* unknown encoding */
            return(0);

        /*
         * Checked in order and stopping at the first mismatch, so a
         * truncated sequence fails on the terminator and nothing past
         * the end of the string is read.
         */
        for (k = 1; k <= trailing; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return(0);
        }
        cur += trailing + 1;
    }
    return(1);
}
848
849 /**
850 * xmlUTF8Strsize:
851 * @utf: a sequence of UTF-8 encoded bytes
852 * @len: the number of characters in the array
853 *
854 * storage size of an UTF8 string
855 * the behaviour is not guaranteed if the input string is not UTF-8
856 *
857 * Returns the storage size of
858 * the first 'len' characters of ARRAY
859 */
860
861 int
xmlUTF8Strsize(const xmlChar * utf,int len)862 xmlUTF8Strsize(const xmlChar *utf, int len) {
863 const xmlChar *ptr=utf;
864 xmlChar ch;
865
866 if (utf == NULL)
867 return(0);
868
869 if (len <= 0)
870 return(0);
871
872 while ( len-- > 0) {
873 if ( !*ptr )
874 break;
875 if ( (ch = *ptr++) & 0x80)
876 while ((ch<<=1) & 0x80 ) {
877 if (*ptr == 0) break;
878 ptr++;
879 }
880 }
881 return (ptr - utf);
882 }
883
884
885 /**
886 * xmlUTF8Strndup:
887 * @utf: the input UTF8 *
888 * @len: the len of @utf (in chars)
889 *
890 * a strndup for array of UTF8's
891 *
892 * Returns a new UTF8 * or NULL
893 */
894 xmlChar *
xmlUTF8Strndup(const xmlChar * utf,int len)895 xmlUTF8Strndup(const xmlChar *utf, int len) {
896 xmlChar *ret;
897 int i;
898
899 if ((utf == NULL) || (len < 0)) return(NULL);
900 i = xmlUTF8Strsize(utf, len);
901 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
902 if (ret == NULL) {
903 xmlGenericError(xmlGenericErrorContext,
904 "malloc of %ld byte failed\n",
905 (len + 1) * (long)sizeof(xmlChar));
906 return(NULL);
907 }
908 memcpy(ret, utf, i * sizeof(xmlChar));
909 ret[i] = 0;
910 return(ret);
911 }
912
913 /**
914 * xmlUTF8Strpos:
915 * @utf: the input UTF8 *
916 * @pos: the position of the desired UTF8 char (in chars)
917 *
918 * a function to provide the equivalent of fetching a
919 * character from a string array
920 *
921 * Returns a pointer to the UTF8 character or NULL
922 */
923 const xmlChar *
xmlUTF8Strpos(const xmlChar * utf,int pos)924 xmlUTF8Strpos(const xmlChar *utf, int pos) {
925 xmlChar ch;
926
927 if (utf == NULL) return(NULL);
928 if (pos < 0)
929 return(NULL);
930 while (pos--) {
931 if ((ch=*utf++) == 0) return(NULL);
932 if ( ch & 0x80 ) {
933 /* if not simple ascii, verify proper format */
934 if ( (ch & 0xc0) != 0xc0 )
935 return(NULL);
936 /* then skip over remaining bytes for this char */
937 while ( (ch <<= 1) & 0x80 )
938 if ( (*utf++ & 0xc0) != 0x80 )
939 return(NULL);
940 }
941 }
942 return((xmlChar *)utf);
943 }
944
945 /**
946 * xmlUTF8Strloc:
947 * @utf: the input UTF8 *
948 * @utfchar: the UTF8 character to be found
949 *
950 * a function to provide the relative location of a UTF8 char
951 *
952 * Returns the relative character position of the desired char
953 * or -1 if not found
954 */
955 int
xmlUTF8Strloc(const xmlChar * utf,const xmlChar * utfchar)956 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
957 int i, size;
958 xmlChar ch;
959
960 if (utf==NULL || utfchar==NULL) return -1;
961 size = xmlUTF8Strsize(utfchar, 1);
962 for(i=0; (ch=*utf) != 0; i++) {
963 if (xmlStrncmp(utf, utfchar, size)==0)
964 return(i);
965 utf++;
966 if ( ch & 0x80 ) {
967 /* if not simple ascii, verify proper format */
968 if ( (ch & 0xc0) != 0xc0 )
969 return(-1);
970 /* then skip over remaining bytes for this char */
971 while ( (ch <<= 1) & 0x80 )
972 if ( (*utf++ & 0xc0) != 0x80 )
973 return(-1);
974 }
975 }
976
977 return(-1);
978 }
979 /**
980 * xmlUTF8Strsub:
981 * @utf: a sequence of UTF-8 encoded bytes
982 * @start: relative pos of first char
983 * @len: total number to copy
984 *
985 * Create a substring from a given UTF-8 string
986 * Note: positions are given in units of UTF-8 chars
987 *
988 * Returns a pointer to a newly created string
989 * or NULL if any problem
990 */
991
992 xmlChar *
xmlUTF8Strsub(const xmlChar * utf,int start,int len)993 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
994 int i;
995 xmlChar ch;
996
997 if (utf == NULL) return(NULL);
998 if (start < 0) return(NULL);
999 if (len < 0) return(NULL);
1000
1001 /*
1002 * Skip over any leading chars
1003 */
1004 for (i = 0;i < start;i++) {
1005 if ((ch=*utf++) == 0) return(NULL);
1006 if ( ch & 0x80 ) {
1007 /* if not simple ascii, verify proper format */
1008 if ( (ch & 0xc0) != 0xc0 )
1009 return(NULL);
1010 /* then skip over remaining bytes for this char */
1011 while ( (ch <<= 1) & 0x80 )
1012 if ( (*utf++ & 0xc0) != 0x80 )
1013 return(NULL);
1014 }
1015 }
1016
1017 return(xmlUTF8Strndup(utf, len));
1018 }
1019
1020 /**
1021 * xmlEscapeFormatString:
1022 * @msg: a pointer to the string in which to escape '%' characters.
1023 * Must be a heap-allocated buffer created by libxml2 that may be
1024 * returned, or that may be freed and replaced.
1025 *
1026 * Replaces the string pointed to by 'msg' with an escaped string.
1027 * Returns the same string with all '%' characters escaped.
1028 */
1029 xmlChar *
xmlEscapeFormatString(xmlChar ** msg)1030 xmlEscapeFormatString(xmlChar **msg)
1031 {
1032 xmlChar *msgPtr = NULL;
1033 xmlChar *result = NULL;
1034 xmlChar *resultPtr = NULL;
1035 size_t count = 0;
1036 size_t msgLen = 0;
1037 size_t resultLen = 0;
1038
1039 if (!msg || !*msg)
1040 return(NULL);
1041
1042 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1043 ++msgLen;
1044 if (*msgPtr == '%')
1045 ++count;
1046 }
1047
1048 if (count == 0)
1049 return(*msg);
1050
1051 resultLen = msgLen + count + 1;
1052 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1053 if (result == NULL) {
1054 /* Clear *msg to prevent format string vulnerabilities in
1055 out-of-memory situations. */
1056 xmlFree(*msg);
1057 *msg = NULL;
1058 xmlErrMemory(NULL, NULL);
1059 return(NULL);
1060 }
1061
1062 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1063 *resultPtr = *msgPtr;
1064 if (*msgPtr == '%')
1065 *(++resultPtr) = '%';
1066 }
1067 result[resultLen - 1] = '\0';
1068
1069 xmlFree(*msg);
1070 *msg = result;
1071
1072 return *msg;
1073 }
1074
1075 #define bottom_xmlstring
1076 #include "elfgcchack.h"
1077