1 /* 2 * string.c : an XML string utilities module 3 * 4 * This module provides various utility functions for manipulating 5 * the xmlChar* type. All functions named xmlStr* have been moved here 6 * from the parser.c file (their original home). 7 * 8 * See Copyright for the status of this software. 9 * 10 * UTF8 string routines from: 11 * William Brack <wbrack@mmm.com.hk> 12 * 13 * daniel@veillard.com 14 */ 15 16 #define IN_LIBXML 17 #include "libxml.h" 18 19 #include <stdlib.h> 20 #include <string.h> 21 #include <limits.h> 22 #include <libxml/xmlmemory.h> 23 #include <libxml/parserInternals.h> 24 #include <libxml/xmlstring.h> 25 26 /************************************************************************ 27 * * 28 * Commodity functions to handle xmlChars * 29 * * 30 ************************************************************************/ 31 32 /** 33 * xmlStrndup: 34 * @cur: the input xmlChar * 35 * @len: the len of @cur 36 * 37 * a strndup for array of xmlChar's 38 * 39 * Returns a new xmlChar * or NULL 40 */ 41 xmlChar * 42 xmlStrndup(const xmlChar *cur, int len) { 43 xmlChar *ret; 44 45 if ((cur == NULL) || (len < 0)) return(NULL); 46 ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar)); 47 if (ret == NULL) { 48 xmlErrMemory(NULL, NULL); 49 return(NULL); 50 } 51 memcpy(ret, cur, len * sizeof(xmlChar)); 52 ret[len] = 0; 53 return(ret); 54 } 55 56 /** 57 * xmlStrdup: 58 * @cur: the input xmlChar * 59 * 60 * a strdup for array of xmlChar's. Since they are supposed to be 61 * encoded in UTF-8 or an encoding with 8bit based chars, we assume 62 * a termination mark of '0'. 63 * 64 * Returns a new xmlChar * or NULL 65 */ 66 xmlChar * 67 xmlStrdup(const xmlChar *cur) { 68 const xmlChar *p = cur; 69 70 if (cur == NULL) return(NULL); 71 while (*p != 0) p++; /* non input consuming */ 72 return(xmlStrndup(cur, p - cur)); 73 } 74 75 /** 76 * xmlCharStrndup: 77 * @cur: the input char * 78 * @len: the len of @cur 79 * 80 * a strndup for char's to xmlChar's 81 * 82 * Returns a new xmlChar * or NULL 83 */ 84 85 xmlChar * 86 xmlCharStrndup(const char *cur, int len) { 87 int i; 88 xmlChar *ret; 89 90 if ((cur == NULL) || (len < 0)) return(NULL); 91 ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar)); 92 if (ret == NULL) { 93 xmlErrMemory(NULL, NULL); 94 return(NULL); 95 } 96 for (i = 0;i < len;i++) { 97 ret[i] = (xmlChar) cur[i]; 98 if (ret[i] == 0) return(ret); 99 } 100 ret[len] = 0; 101 return(ret); 102 } 103 104 /** 105 * xmlCharStrdup: 106 * @cur: the input char * 107 * 108 * a strdup for char's to xmlChar's 109 * 110 * Returns a new xmlChar * or NULL 111 */ 112 113 xmlChar * 114 xmlCharStrdup(const char *cur) { 115 const char *p = cur; 116 117 if (cur == NULL) return(NULL); 118 while (*p != '\0') p++; /* non input consuming */ 119 return(xmlCharStrndup(cur, p - cur)); 120 } 121 122 /** 123 * xmlStrcmp: 124 * @str1: the first xmlChar * 125 * @str2: the second xmlChar * 126 * 127 * a strcmp for xmlChar's 128 * 129 * Returns the integer result of the comparison 130 */ 131 132 int 133 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) { 134 if (str1 == str2) return(0); 135 if (str1 == NULL) return(-1); 136 if (str2 == NULL) return(1); 137 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 138 return(strcmp((const char *)str1, (const char *)str2)); 139 #else 140 do { 141 int tmp = *str1++ - *str2; 142 if (tmp != 0) return(tmp); 143 } while (*str2++ != 0); 144 return 0; 145 #endif 146 } 147 148 /** 149 * xmlStrEqual: 150 * @str1: the first xmlChar * 151 * @str2: the second xmlChar * 152 * 153 * Check if both strings are equal of have same content. 154 * Should be a bit more readable and faster than xmlStrcmp() 155 * 156 * Returns 1 if they are equal, 0 if they are different 157 */ 158 159 int 160 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) { 161 if (str1 == str2) return(1); 162 if (str1 == NULL) return(0); 163 if (str2 == NULL) return(0); 164 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 165 return(strcmp((const char *)str1, (const char *)str2) == 0); 166 #else 167 do { 168 if (*str1++ != *str2) return(0); 169 } while (*str2++); 170 return(1); 171 #endif 172 } 173 174 /** 175 * xmlStrQEqual: 176 * @pref: the prefix of the QName 177 * @name: the localname of the QName 178 * @str: the second xmlChar * 179 * 180 * Check if a QName is Equal to a given string 181 * 182 * Returns 1 if they are equal, 0 if they are different 183 */ 184 185 int 186 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) { 187 if (pref == NULL) return(xmlStrEqual(name, str)); 188 if (name == NULL) return(0); 189 if (str == NULL) return(0); 190 191 do { 192 if (*pref++ != *str) return(0); 193 } while ((*str++) && (*pref)); 194 if (*str++ != ':') return(0); 195 do { 196 if (*name++ != *str) return(0); 197 } while (*str++); 198 return(1); 199 } 200 201 /** 202 * xmlStrncmp: 203 * @str1: the first xmlChar * 204 * @str2: the second xmlChar * 205 * @len: the max comparison length 206 * 207 * a strncmp for xmlChar's 208 * 209 * Returns the integer result of the comparison 210 */ 211 212 int 213 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) { 214 if (len <= 0) return(0); 215 if (str1 == str2) return(0); 216 if (str1 == NULL) return(-1); 217 if (str2 == NULL) return(1); 218 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 219 return(strncmp((const char *)str1, (const char *)str2, len)); 220 #else 221 do { 222 int tmp = *str1++ - *str2; 223 if (tmp != 0 || --len == 0) return(tmp); 224 } while (*str2++ != 0); 225 return 0; 226 #endif 227 } 228 229 static const xmlChar casemap[256] = { 230 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 231 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, 232 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 233 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, 234 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, 235 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, 236 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 237 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, 238 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 241 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F, 242 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 243 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 244 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 245 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, 246 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 247 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, 248 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 249 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, 250 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7, 251 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, 252 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7, 253 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, 254 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7, 255 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, 256 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7, 257 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, 258 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7, 259 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, 260 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7, 261 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF 262 }; 263 264 /** 265 * xmlStrcasecmp: 266 * @str1: the first xmlChar * 267 * @str2: the second xmlChar * 268 * 269 * a strcasecmp for xmlChar's 270 * 271 * Returns the integer result of the comparison 272 */ 273 274 int 275 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) { 276 register int tmp; 277 278 if (str1 == str2) return(0); 279 if (str1 == NULL) return(-1); 280 if (str2 == NULL) return(1); 281 do { 282 tmp = casemap[*str1++] - casemap[*str2]; 283 if (tmp != 0) return(tmp); 284 } while (*str2++ != 0); 285 return 0; 286 } 287 288 /** 289 * xmlStrncasecmp: 290 * @str1: the first xmlChar * 291 * @str2: the second xmlChar * 292 * @len: the max comparison length 293 * 294 * a strncasecmp for xmlChar's 295 * 296 * Returns the integer result of the comparison 297 */ 298 299 int 300 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) { 301 register int tmp; 302 303 if (len <= 0) return(0); 304 if (str1 == str2) return(0); 305 if (str1 == NULL) return(-1); 306 if (str2 == NULL) return(1); 307 do { 308 tmp = casemap[*str1++] - casemap[*str2]; 309 if (tmp != 0 || --len == 0) return(tmp); 310 } while (*str2++ != 0); 311 return 0; 312 } 313 314 /** 315 * xmlStrchr: 316 * @str: the xmlChar * array 317 * @val: the xmlChar to search 318 * 319 * a strchr for xmlChar's 320 * 321 * Returns the xmlChar * for the first occurrence or NULL. 322 */ 323 324 const xmlChar * 325 xmlStrchr(const xmlChar *str, xmlChar val) { 326 if (str == NULL) return(NULL); 327 while (*str != 0) { /* non input consuming */ 328 if (*str == val) return((xmlChar *) str); 329 str++; 330 } 331 return(NULL); 332 } 333 334 /** 335 * xmlStrstr: 336 * @str: the xmlChar * array (haystack) 337 * @val: the xmlChar to search (needle) 338 * 339 * a strstr for xmlChar's 340 * 341 * Returns the xmlChar * for the first occurrence or NULL. 342 */ 343 344 const xmlChar * 345 xmlStrstr(const xmlChar *str, const xmlChar *val) { 346 int n; 347 348 if (str == NULL) return(NULL); 349 if (val == NULL) return(NULL); 350 n = xmlStrlen(val); 351 352 if (n == 0) return(str); 353 while (*str != 0) { /* non input consuming */ 354 if (*str == *val) { 355 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str); 356 } 357 str++; 358 } 359 return(NULL); 360 } 361 362 /** 363 * xmlStrcasestr: 364 * @str: the xmlChar * array (haystack) 365 * @val: the xmlChar to search (needle) 366 * 367 * a case-ignoring strstr for xmlChar's 368 * 369 * Returns the xmlChar * for the first occurrence or NULL. 370 */ 371 372 const xmlChar * 373 xmlStrcasestr(const xmlChar *str, const xmlChar *val) { 374 int n; 375 376 if (str == NULL) return(NULL); 377 if (val == NULL) return(NULL); 378 n = xmlStrlen(val); 379 380 if (n == 0) return(str); 381 while (*str != 0) { /* non input consuming */ 382 if (casemap[*str] == casemap[*val]) 383 if (!xmlStrncasecmp(str, val, n)) return(str); 384 str++; 385 } 386 return(NULL); 387 } 388 389 /** 390 * xmlStrsub: 391 * @str: the xmlChar * array (haystack) 392 * @start: the index of the first char (zero based) 393 * @len: the length of the substring 394 * 395 * Extract a substring of a given string 396 * 397 * Returns the xmlChar * for the first occurrence or NULL. 398 */ 399 400 xmlChar * 401 xmlStrsub(const xmlChar *str, int start, int len) { 402 int i; 403 404 if (str == NULL) return(NULL); 405 if (start < 0) return(NULL); 406 if (len < 0) return(NULL); 407 408 for (i = 0;i < start;i++) { 409 if (*str == 0) return(NULL); 410 str++; 411 } 412 if (*str == 0) return(NULL); 413 return(xmlStrndup(str, len)); 414 } 415 416 /** 417 * xmlStrlen: 418 * @str: the xmlChar * array 419 * 420 * length of a xmlChar's string 421 * 422 * Returns the number of xmlChar contained in the ARRAY. 423 */ 424 425 int 426 xmlStrlen(const xmlChar *str) { 427 size_t len = str ? strlen((const char *)str) : 0; 428 return(len > INT_MAX ? 0 : len); 429 } 430 431 /** 432 * xmlStrncat: 433 * @cur: the original xmlChar * array 434 * @add: the xmlChar * array added 435 * @len: the length of @add 436 * 437 * a strncat for array of xmlChar's, it will extend @cur with the len 438 * first bytes of @add. Note that if @len < 0 then this is an API error 439 * and NULL will be returned. 440 * 441 * Returns a new xmlChar *, the original @cur is reallocated and should 442 * not be freed. 443 */ 444 445 xmlChar * 446 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) { 447 int size; 448 xmlChar *ret; 449 450 if ((add == NULL) || (len == 0)) 451 return(cur); 452 if (len < 0) 453 return(NULL); 454 if (cur == NULL) 455 return(xmlStrndup(add, len)); 456 457 size = xmlStrlen(cur); 458 if ((size < 0) || (size > INT_MAX - len)) 459 return(NULL); 460 ret = (xmlChar *) xmlRealloc(cur, ((size_t) size + len + 1) * sizeof(xmlChar)); 461 if (ret == NULL) { 462 xmlErrMemory(NULL, NULL); 463 return(cur); 464 } 465 memcpy(&ret[size], add, len * sizeof(xmlChar)); 466 ret[size + len] = 0; 467 return(ret); 468 } 469 470 /** 471 * xmlStrncatNew: 472 * @str1: first xmlChar string 473 * @str2: second xmlChar string 474 * @len: the len of @str2 or < 0 475 * 476 * same as xmlStrncat, but creates a new string. The original 477 * two strings are not freed. If @len is < 0 then the length 478 * will be calculated automatically. 479 * 480 * Returns a new xmlChar * or NULL 481 */ 482 xmlChar * 483 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) { 484 int size; 485 xmlChar *ret; 486 487 if (len < 0) { 488 len = xmlStrlen(str2); 489 if (len < 0) 490 return(NULL); 491 } 492 if ((str2 == NULL) || (len == 0)) 493 return(xmlStrdup(str1)); 494 if (str1 == NULL) 495 return(xmlStrndup(str2, len)); 496 497 size = xmlStrlen(str1); 498 if ((size < 0) || (size > INT_MAX - len)) 499 return(NULL); 500 ret = (xmlChar *) xmlMalloc(((size_t) size + len + 1) * sizeof(xmlChar)); 501 if (ret == NULL) { 502 xmlErrMemory(NULL, NULL); 503 return(xmlStrndup(str1, size)); 504 } 505 memcpy(ret, str1, size * sizeof(xmlChar)); 506 memcpy(&ret[size], str2, len * sizeof(xmlChar)); 507 ret[size + len] = 0; 508 return(ret); 509 } 510 511 /** 512 * xmlStrcat: 513 * @cur: the original xmlChar * array 514 * @add: the xmlChar * array added 515 * 516 * a strcat for array of xmlChar's. Since they are supposed to be 517 * encoded in UTF-8 or an encoding with 8bit based chars, we assume 518 * a termination mark of '0'. 519 * 520 * Returns a new xmlChar * containing the concatenated string. The original 521 * @cur is reallocated and should not be freed. 522 */ 523 xmlChar * 524 xmlStrcat(xmlChar *cur, const xmlChar *add) { 525 const xmlChar *p = add; 526 527 if (add == NULL) return(cur); 528 if (cur == NULL) 529 return(xmlStrdup(add)); 530 531 while (*p != 0) p++; /* non input consuming */ 532 return(xmlStrncat(cur, add, p - add)); 533 } 534 535 /** 536 * xmlStrPrintf: 537 * @buf: the result buffer. 538 * @len: the result buffer length. 539 * @msg: the message with printf formatting. 540 * @...: extra parameters for the message. 541 * 542 * Formats @msg and places result into @buf. 543 * 544 * Returns the number of characters written to @buf or -1 if an error occurs. 545 */ 546 int XMLCDECL 547 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) { 548 va_list args; 549 int ret; 550 551 if((buf == NULL) || (msg == NULL)) { 552 return(-1); 553 } 554 555 va_start(args, msg); 556 ret = vsnprintf((char *) buf, len, (const char *) msg, args); 557 va_end(args); 558 buf[len - 1] = 0; /* be safe ! */ 559 560 return(ret); 561 } 562 563 /** 564 * xmlStrVPrintf: 565 * @buf: the result buffer. 566 * @len: the result buffer length. 567 * @msg: the message with printf formatting. 568 * @ap: extra parameters for the message. 569 * 570 * Formats @msg and places result into @buf. 571 * 572 * Returns the number of characters written to @buf or -1 if an error occurs. 573 */ 574 int 575 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) { 576 int ret; 577 578 if((buf == NULL) || (msg == NULL)) { 579 return(-1); 580 } 581 582 ret = vsnprintf((char *) buf, len, (const char *) msg, ap); 583 buf[len - 1] = 0; /* be safe ! */ 584 585 return(ret); 586 } 587 588 /************************************************************************ 589 * * 590 * Generic UTF8 handling routines * 591 * * 592 * From rfc2044: encoding of the Unicode values on UTF-8: * 593 * * 594 * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 595 * 0000 0000-0000 007F 0xxxxxxx * 596 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 597 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * 598 * * 599 * I hope we won't use values > 0xFFFF anytime soon ! * 600 * * 601 ************************************************************************/ 602 603 604 /** 605 * xmlUTF8Size: 606 * @utf: pointer to the UTF8 character 607 * 608 * calculates the internal size of a UTF8 character 609 * 610 * returns the numbers of bytes in the character, -1 on format error 611 */ 612 int 613 xmlUTF8Size(const xmlChar *utf) { 614 xmlChar mask; 615 int len; 616 617 if (utf == NULL) 618 return -1; 619 if (*utf < 0x80) 620 return 1; 621 /* check valid UTF8 character */ 622 if (!(*utf & 0x40)) 623 return -1; 624 /* determine number of bytes in char */ 625 len = 2; 626 for (mask=0x20; mask != 0; mask>>=1) { 627 if (!(*utf & mask)) 628 return len; 629 len++; 630 } 631 return -1; 632 } 633 634 /** 635 * xmlUTF8Charcmp: 636 * @utf1: pointer to first UTF8 char 637 * @utf2: pointer to second UTF8 char 638 * 639 * compares the two UCS4 values 640 * 641 * returns result of the compare as with xmlStrncmp 642 */ 643 int 644 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) { 645 646 if (utf1 == NULL ) { 647 if (utf2 == NULL) 648 return 0; 649 return -1; 650 } 651 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1)); 652 } 653 654 /** 655 * xmlUTF8Strlen: 656 * @utf: a sequence of UTF-8 encoded bytes 657 * 658 * compute the length of an UTF8 string, it doesn't do a full UTF8 659 * checking of the content of the string. 660 * 661 * Returns the number of characters in the string or -1 in case of error 662 */ 663 int 664 xmlUTF8Strlen(const xmlChar *utf) { 665 size_t ret = 0; 666 667 if (utf == NULL) 668 return(-1); 669 670 while (*utf != 0) { 671 if (utf[0] & 0x80) { 672 if ((utf[1] & 0xc0) != 0x80) 673 return(-1); 674 if ((utf[0] & 0xe0) == 0xe0) { 675 if ((utf[2] & 0xc0) != 0x80) 676 return(-1); 677 if ((utf[0] & 0xf0) == 0xf0) { 678 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 679 return(-1); 680 utf += 4; 681 } else { 682 utf += 3; 683 } 684 } else { 685 utf += 2; 686 } 687 } else { 688 utf++; 689 } 690 ret++; 691 } 692 return(ret > INT_MAX ? 0 : ret); 693 } 694 695 /** 696 * xmlGetUTF8Char: 697 * @utf: a sequence of UTF-8 encoded bytes 698 * @len: a pointer to the minimum number of bytes present in 699 * the sequence. This is used to assure the next character 700 * is completely contained within the sequence. 701 * 702 * Read the first UTF8 character from @utf 703 * 704 * Returns the char value or -1 in case of error, and sets *len to 705 * the actual number of bytes consumed (0 in case of error) 706 */ 707 int 708 xmlGetUTF8Char(const unsigned char *utf, int *len) { 709 unsigned int c; 710 711 if (utf == NULL) 712 goto error; 713 if (len == NULL) 714 goto error; 715 if (*len < 1) 716 goto error; 717 718 c = utf[0]; 719 if (c & 0x80) { 720 if (*len < 2) 721 goto error; 722 if ((utf[1] & 0xc0) != 0x80) 723 goto error; 724 if ((c & 0xe0) == 0xe0) { 725 if (*len < 3) 726 goto error; 727 if ((utf[2] & 0xc0) != 0x80) 728 goto error; 729 if ((c & 0xf0) == 0xf0) { 730 if (*len < 4) 731 goto error; 732 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 733 goto error; 734 *len = 4; 735 /* 4-byte code */ 736 c = (utf[0] & 0x7) << 18; 737 c |= (utf[1] & 0x3f) << 12; 738 c |= (utf[2] & 0x3f) << 6; 739 c |= utf[3] & 0x3f; 740 } else { 741 /* 3-byte code */ 742 *len = 3; 743 c = (utf[0] & 0xf) << 12; 744 c |= (utf[1] & 0x3f) << 6; 745 c |= utf[2] & 0x3f; 746 } 747 } else { 748 /* 2-byte code */ 749 *len = 2; 750 c = (utf[0] & 0x1f) << 6; 751 c |= utf[1] & 0x3f; 752 } 753 } else { 754 /* 1-byte code */ 755 *len = 1; 756 } 757 return(c); 758 759 error: 760 if (len != NULL) 761 *len = 0; 762 return(-1); 763 } 764 765 /** 766 * xmlCheckUTF8: 767 * @utf: Pointer to putative UTF-8 encoded string. 768 * 769 * Checks @utf for being valid UTF-8. @utf is assumed to be 770 * null-terminated. This function is not super-strict, as it will 771 * allow longer UTF-8 sequences than necessary. Note that Java is 772 * capable of producing these sequences if provoked. Also note, this 773 * routine checks for the 4-byte maximum size, but does not check for 774 * 0x10ffff maximum value. 775 * 776 * Return value: true if @utf is valid. 777 **/ 778 int 779 xmlCheckUTF8(const unsigned char *utf) 780 { 781 int ix; 782 unsigned char c; 783 784 if (utf == NULL) 785 return(0); 786 /* 787 * utf is a string of 1, 2, 3 or 4 bytes. The valid strings 788 * are as follows (in "bit format"): 789 * 0xxxxxxx valid 1-byte 790 * 110xxxxx 10xxxxxx valid 2-byte 791 * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte 792 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte 793 */ 794 while ((c = utf[0])) { /* string is 0-terminated */ 795 ix = 0; 796 if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */ 797 ix = 1; 798 } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */ 799 if ((utf[1] & 0xc0 ) != 0x80) 800 return 0; 801 ix = 2; 802 } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */ 803 if (((utf[1] & 0xc0) != 0x80) || 804 ((utf[2] & 0xc0) != 0x80)) 805 return 0; 806 ix = 3; 807 } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */ 808 if (((utf[1] & 0xc0) != 0x80) || 809 ((utf[2] & 0xc0) != 0x80) || 810 ((utf[3] & 0xc0) != 0x80)) 811 return 0; 812 ix = 4; 813 } else /* unknown encoding */ 814 return 0; 815 utf += ix; 816 } 817 return(1); 818 } 819 820 /** 821 * xmlUTF8Strsize: 822 * @utf: a sequence of UTF-8 encoded bytes 823 * @len: the number of characters in the array 824 * 825 * storage size of an UTF8 string 826 * the behaviour is not guaranteed if the input string is not UTF-8 827 * 828 * Returns the storage size of 829 * the first 'len' characters of ARRAY 830 */ 831 832 int 833 xmlUTF8Strsize(const xmlChar *utf, int len) { 834 const xmlChar *ptr=utf; 835 int ch; 836 size_t ret; 837 838 if (utf == NULL) 839 return(0); 840 841 if (len <= 0) 842 return(0); 843 844 while ( len-- > 0) { 845 if ( !*ptr ) 846 break; 847 if ( (ch = *ptr++) & 0x80) 848 while ((ch<<=1) & 0x80 ) { 849 if (*ptr == 0) break; 850 ptr++; 851 } 852 } 853 ret = ptr - utf; 854 return (ret > INT_MAX ? 0 : ret); 855 } 856 857 858 /** 859 * xmlUTF8Strndup: 860 * @utf: the input UTF8 * 861 * @len: the len of @utf (in chars) 862 * 863 * a strndup for array of UTF8's 864 * 865 * Returns a new UTF8 * or NULL 866 */ 867 xmlChar * 868 xmlUTF8Strndup(const xmlChar *utf, int len) { 869 xmlChar *ret; 870 int i; 871 872 if ((utf == NULL) || (len < 0)) return(NULL); 873 i = xmlUTF8Strsize(utf, len); 874 ret = (xmlChar *) xmlMallocAtomic(((size_t) i + 1) * sizeof(xmlChar)); 875 if (ret == NULL) { 876 return(NULL); 877 } 878 memcpy(ret, utf, i * sizeof(xmlChar)); 879 ret[i] = 0; 880 return(ret); 881 } 882 883 /** 884 * xmlUTF8Strpos: 885 * @utf: the input UTF8 * 886 * @pos: the position of the desired UTF8 char (in chars) 887 * 888 * a function to provide the equivalent of fetching a 889 * character from a string array 890 * 891 * Returns a pointer to the UTF8 character or NULL 892 */ 893 const xmlChar * 894 xmlUTF8Strpos(const xmlChar *utf, int pos) { 895 int ch; 896 897 if (utf == NULL) return(NULL); 898 if (pos < 0) 899 return(NULL); 900 while (pos--) { 901 if ((ch=*utf++) == 0) return(NULL); 902 if ( ch & 0x80 ) { 903 /* if not simple ascii, verify proper format */ 904 if ( (ch & 0xc0) != 0xc0 ) 905 return(NULL); 906 /* then skip over remaining bytes for this char */ 907 while ( (ch <<= 1) & 0x80 ) 908 if ( (*utf++ & 0xc0) != 0x80 ) 909 return(NULL); 910 } 911 } 912 return((xmlChar *)utf); 913 } 914 915 /** 916 * xmlUTF8Strloc: 917 * @utf: the input UTF8 * 918 * @utfchar: the UTF8 character to be found 919 * 920 * a function to provide the relative location of a UTF8 char 921 * 922 * Returns the relative character position of the desired char 923 * or -1 if not found 924 */ 925 int 926 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) { 927 size_t i; 928 int size; 929 int ch; 930 931 if (utf==NULL || utfchar==NULL) return -1; 932 size = xmlUTF8Strsize(utfchar, 1); 933 for(i=0; (ch=*utf) != 0; i++) { 934 if (xmlStrncmp(utf, utfchar, size)==0) 935 return(i > INT_MAX ? 0 : i); 936 utf++; 937 if ( ch & 0x80 ) { 938 /* if not simple ascii, verify proper format */ 939 if ( (ch & 0xc0) != 0xc0 ) 940 return(-1); 941 /* then skip over remaining bytes for this char */ 942 while ( (ch <<= 1) & 0x80 ) 943 if ( (*utf++ & 0xc0) != 0x80 ) 944 return(-1); 945 } 946 } 947 948 return(-1); 949 } 950 /** 951 * xmlUTF8Strsub: 952 * @utf: a sequence of UTF-8 encoded bytes 953 * @start: relative pos of first char 954 * @len: total number to copy 955 * 956 * Create a substring from a given UTF-8 string 957 * Note: positions are given in units of UTF-8 chars 958 * 959 * Returns a pointer to a newly created string 960 * or NULL if any problem 961 */ 962 963 xmlChar * 964 xmlUTF8Strsub(const xmlChar *utf, int start, int len) { 965 int i; 966 int ch; 967 968 if (utf == NULL) return(NULL); 969 if (start < 0) return(NULL); 970 if (len < 0) return(NULL); 971 972 /* 973 * Skip over any leading chars 974 */ 975 for (i = 0;i < start;i++) { 976 if ((ch=*utf++) == 0) return(NULL); 977 if ( ch & 0x80 ) { 978 /* if not simple ascii, verify proper format */ 979 if ( (ch & 0xc0) != 0xc0 ) 980 return(NULL); 981 /* then skip over remaining bytes for this char */ 982 while ( (ch <<= 1) & 0x80 ) 983 if ( (*utf++ & 0xc0) != 0x80 ) 984 return(NULL); 985 } 986 } 987 988 return(xmlUTF8Strndup(utf, len)); 989 } 990 991 /** 992 * xmlEscapeFormatString: 993 * @msg: a pointer to the string in which to escape '%' characters. 994 * Must be a heap-allocated buffer created by libxml2 that may be 995 * returned, or that may be freed and replaced. 996 * 997 * Replaces the string pointed to by 'msg' with an escaped string. 998 * Returns the same string with all '%' characters escaped. 999 */ 1000 xmlChar * 1001 xmlEscapeFormatString(xmlChar **msg) 1002 { 1003 xmlChar *msgPtr = NULL; 1004 xmlChar *result = NULL; 1005 xmlChar *resultPtr = NULL; 1006 size_t count = 0; 1007 size_t msgLen = 0; 1008 size_t resultLen = 0; 1009 1010 if (!msg || !*msg) 1011 return(NULL); 1012 1013 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) { 1014 ++msgLen; 1015 if (*msgPtr == '%') 1016 ++count; 1017 } 1018 1019 if (count == 0) 1020 return(*msg); 1021 1022 if ((count > INT_MAX) || (msgLen > INT_MAX - count)) 1023 return(NULL); 1024 resultLen = msgLen + count + 1; 1025 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar)); 1026 if (result == NULL) { 1027 /* Clear *msg to prevent format string vulnerabilities in 1028 out-of-memory situations. */ 1029 xmlFree(*msg); 1030 *msg = NULL; 1031 xmlErrMemory(NULL, NULL); 1032 return(NULL); 1033 } 1034 1035 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) { 1036 *resultPtr = *msgPtr; 1037 if (*msgPtr == '%') 1038 *(++resultPtr) = '%'; 1039 } 1040 result[resultLen - 1] = '\0'; 1041 1042 xmlFree(*msg); 1043 *msg = result; 1044 1045 return *msg; 1046 } 1047 1048