1 /* 2 * string.c : an XML string utilities module 3 * 4 * This module provides various utility functions for manipulating 5 * the xmlChar* type. All functions named xmlStr* have been moved here 6 * from the parser.c file (their original home). 7 * 8 * See Copyright for the status of this software. 9 * 10 * UTF8 string routines from: 11 * William Brack <wbrack@mmm.com.hk> 12 * 13 * daniel@veillard.com 14 */ 15 16 #define IN_LIBXML 17 #include "libxml.h" 18 19 #include <stdlib.h> 20 #include <string.h> 21 #include <libxml/xmlmemory.h> 22 #include <libxml/parserInternals.h> 23 #include <libxml/xmlstring.h> 24 25 /************************************************************************ 26 * * 27 * Commodity functions to handle xmlChars * 28 * * 29 ************************************************************************/ 30 31 /** 32 * xmlStrndup: 33 * @cur: the input xmlChar * 34 * @len: the len of @cur 35 * 36 * a strndup for array of xmlChar's 37 * 38 * Returns a new xmlChar * or NULL 39 */ 40 xmlChar * 41 xmlStrndup(const xmlChar *cur, int len) { 42 xmlChar *ret; 43 44 if ((cur == NULL) || (len < 0)) return(NULL); 45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar)); 46 if (ret == NULL) { 47 xmlErrMemory(NULL, NULL); 48 return(NULL); 49 } 50 memcpy(ret, cur, len * sizeof(xmlChar)); 51 ret[len] = 0; 52 return(ret); 53 } 54 55 /** 56 * xmlStrdup: 57 * @cur: the input xmlChar * 58 * 59 * a strdup for array of xmlChar's. Since they are supposed to be 60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume 61 * a termination mark of '0'. 62 * 63 * Returns a new xmlChar * or NULL 64 */ 65 xmlChar * 66 xmlStrdup(const xmlChar *cur) { 67 const xmlChar *p = cur; 68 69 if (cur == NULL) return(NULL); 70 while (*p != 0) p++; /* non input consuming */ 71 return(xmlStrndup(cur, p - cur)); 72 } 73 74 /** 75 * xmlCharStrndup: 76 * @cur: the input char * 77 * @len: the len of @cur 78 * 79 * a strndup for char's to xmlChar's 80 * 81 * Returns a new xmlChar * or NULL 82 */ 83 84 xmlChar * 85 xmlCharStrndup(const char *cur, int len) { 86 int i; 87 xmlChar *ret; 88 89 if ((cur == NULL) || (len < 0)) return(NULL); 90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar)); 91 if (ret == NULL) { 92 xmlErrMemory(NULL, NULL); 93 return(NULL); 94 } 95 for (i = 0;i < len;i++) { 96 ret[i] = (xmlChar) cur[i]; 97 if (ret[i] == 0) return(ret); 98 } 99 ret[len] = 0; 100 return(ret); 101 } 102 103 /** 104 * xmlCharStrdup: 105 * @cur: the input char * 106 * 107 * a strdup for char's to xmlChar's 108 * 109 * Returns a new xmlChar * or NULL 110 */ 111 112 xmlChar * 113 xmlCharStrdup(const char *cur) { 114 const char *p = cur; 115 116 if (cur == NULL) return(NULL); 117 while (*p != '\0') p++; /* non input consuming */ 118 return(xmlCharStrndup(cur, p - cur)); 119 } 120 121 /** 122 * xmlStrcmp: 123 * @str1: the first xmlChar * 124 * @str2: the second xmlChar * 125 * 126 * a strcmp for xmlChar's 127 * 128 * Returns the integer result of the comparison 129 */ 130 131 int 132 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) { 133 register int tmp; 134 135 if (str1 == str2) return(0); 136 if (str1 == NULL) return(-1); 137 if (str2 == NULL) return(1); 138 do { 139 tmp = *str1++ - *str2; 140 if (tmp != 0) return(tmp); 141 } while (*str2++ != 0); 142 return 0; 143 } 144 145 /** 146 * xmlStrEqual: 147 * @str1: the first xmlChar * 148 * @str2: the second xmlChar * 149 * 150 * Check if both strings are equal of have same content. 151 * Should be a bit more readable and faster than xmlStrcmp() 152 * 153 * Returns 1 if they are equal, 0 if they are different 154 */ 155 156 int 157 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) { 158 if (str1 == str2) return(1); 159 if (str1 == NULL) return(0); 160 if (str2 == NULL) return(0); 161 do { 162 if (*str1++ != *str2) return(0); 163 } while (*str2++); 164 return(1); 165 } 166 167 /** 168 * xmlStrQEqual: 169 * @pref: the prefix of the QName 170 * @name: the localname of the QName 171 * @str: the second xmlChar * 172 * 173 * Check if a QName is Equal to a given string 174 * 175 * Returns 1 if they are equal, 0 if they are different 176 */ 177 178 int 179 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) { 180 if (pref == NULL) return(xmlStrEqual(name, str)); 181 if (name == NULL) return(0); 182 if (str == NULL) return(0); 183 184 do { 185 if (*pref++ != *str) return(0); 186 } while ((*str++) && (*pref)); 187 if (*str++ != ':') return(0); 188 do { 189 if (*name++ != *str) return(0); 190 } while (*str++); 191 return(1); 192 } 193 194 /** 195 * xmlStrncmp: 196 * @str1: the first xmlChar * 197 * @str2: the second xmlChar * 198 * @len: the max comparison length 199 * 200 * a strncmp for xmlChar's 201 * 202 * Returns the integer result of the comparison 203 */ 204 205 int 206 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) { 207 register int tmp; 208 209 if (len <= 0) return(0); 210 if (str1 == str2) return(0); 211 if (str1 == NULL) return(-1); 212 if (str2 == NULL) return(1); 213 #ifdef __GNUC__ 214 tmp = strncmp((const char *)str1, (const char *)str2, len); 215 return tmp; 216 #else 217 do { 218 tmp = *str1++ - *str2; 219 if (tmp != 0 || --len == 0) return(tmp); 220 } while (*str2++ != 0); 221 return 0; 222 #endif 223 } 224 225 static const xmlChar casemap[256] = { 226 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 227 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, 228 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 229 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, 230 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, 231 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, 232 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 233 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, 234 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 235 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 236 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 237 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F, 238 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 241 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, 242 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 243 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, 244 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 245 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, 246 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7, 247 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, 248 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7, 249 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, 250 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7, 251 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, 252 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7, 253 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, 254 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7, 255 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, 256 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7, 257 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF 258 }; 259 260 /** 261 * xmlStrcasecmp: 262 * @str1: the first xmlChar * 263 * @str2: the second xmlChar * 264 * 265 * a strcasecmp for xmlChar's 266 * 267 * Returns the integer result of the comparison 268 */ 269 270 int 271 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) { 272 register int tmp; 273 274 if (str1 == str2) return(0); 275 if (str1 == NULL) return(-1); 276 if (str2 == NULL) return(1); 277 do { 278 tmp = casemap[*str1++] - casemap[*str2]; 279 if (tmp != 0) return(tmp); 280 } while (*str2++ != 0); 281 return 0; 282 } 283 284 /** 285 * xmlStrncasecmp: 286 * @str1: the first xmlChar * 287 * @str2: the second xmlChar * 288 * @len: the max comparison length 289 * 290 * a strncasecmp for xmlChar's 291 * 292 * Returns the integer result of the comparison 293 */ 294 295 int 296 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) { 297 register int tmp; 298 299 if (len <= 0) return(0); 300 if (str1 == str2) return(0); 301 if (str1 == NULL) return(-1); 302 if (str2 == NULL) return(1); 303 do { 304 tmp = casemap[*str1++] - casemap[*str2]; 305 if (tmp != 0 || --len == 0) return(tmp); 306 } while (*str2++ != 0); 307 return 0; 308 } 309 310 /** 311 * xmlStrchr: 312 * @str: the xmlChar * array 313 * @val: the xmlChar to search 314 * 315 * a strchr for xmlChar's 316 * 317 * Returns the xmlChar * for the first occurrence or NULL. 318 */ 319 320 const xmlChar * 321 xmlStrchr(const xmlChar *str, xmlChar val) { 322 if (str == NULL) return(NULL); 323 while (*str != 0) { /* non input consuming */ 324 if (*str == val) return((xmlChar *) str); 325 str++; 326 } 327 return(NULL); 328 } 329 330 /** 331 * xmlStrstr: 332 * @str: the xmlChar * array (haystack) 333 * @val: the xmlChar to search (needle) 334 * 335 * a strstr for xmlChar's 336 * 337 * Returns the xmlChar * for the first occurrence or NULL. 338 */ 339 340 const xmlChar * 341 xmlStrstr(const xmlChar *str, const xmlChar *val) { 342 int n; 343 344 if (str == NULL) return(NULL); 345 if (val == NULL) return(NULL); 346 n = xmlStrlen(val); 347 348 if (n == 0) return(str); 349 while (*str != 0) { /* non input consuming */ 350 if (*str == *val) { 351 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str); 352 } 353 str++; 354 } 355 return(NULL); 356 } 357 358 /** 359 * xmlStrcasestr: 360 * @str: the xmlChar * array (haystack) 361 * @val: the xmlChar to search (needle) 362 * 363 * a case-ignoring strstr for xmlChar's 364 * 365 * Returns the xmlChar * for the first occurrence or NULL. 366 */ 367 368 const xmlChar * 369 xmlStrcasestr(const xmlChar *str, const xmlChar *val) { 370 int n; 371 372 if (str == NULL) return(NULL); 373 if (val == NULL) return(NULL); 374 n = xmlStrlen(val); 375 376 if (n == 0) return(str); 377 while (*str != 0) { /* non input consuming */ 378 if (casemap[*str] == casemap[*val]) 379 if (!xmlStrncasecmp(str, val, n)) return(str); 380 str++; 381 } 382 return(NULL); 383 } 384 385 /** 386 * xmlStrsub: 387 * @str: the xmlChar * array (haystack) 388 * @start: the index of the first char (zero based) 389 * @len: the length of the substring 390 * 391 * Extract a substring of a given string 392 * 393 * Returns the xmlChar * for the first occurrence or NULL. 394 */ 395 396 xmlChar * 397 xmlStrsub(const xmlChar *str, int start, int len) { 398 int i; 399 400 if (str == NULL) return(NULL); 401 if (start < 0) return(NULL); 402 if (len < 0) return(NULL); 403 404 for (i = 0;i < start;i++) { 405 if (*str == 0) return(NULL); 406 str++; 407 } 408 if (*str == 0) return(NULL); 409 return(xmlStrndup(str, len)); 410 } 411 412 /** 413 * xmlStrlen: 414 * @str: the xmlChar * array 415 * 416 * length of a xmlChar's string 417 * 418 * Returns the number of xmlChar contained in the ARRAY. 419 */ 420 421 int 422 xmlStrlen(const xmlChar *str) { 423 int len = 0; 424 425 if (str == NULL) return(0); 426 while (*str != 0) { /* non input consuming */ 427 str++; 428 len++; 429 } 430 return(len); 431 } 432 433 /** 434 * xmlStrncat: 435 * @cur: the original xmlChar * array 436 * @add: the xmlChar * array added 437 * @len: the length of @add 438 * 439 * a strncat for array of xmlChar's, it will extend @cur with the len 440 * first bytes of @add. Note that if @len < 0 then this is an API error 441 * and NULL will be returned. 442 * 443 * Returns a new xmlChar *, the original @cur is reallocated if needed 444 * and should not be freed 445 */ 446 447 xmlChar * 448 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) { 449 int size; 450 xmlChar *ret; 451 452 if ((add == NULL) || (len == 0)) 453 return(cur); 454 if (len < 0) 455 return(NULL); 456 if (cur == NULL) 457 return(xmlStrndup(add, len)); 458 459 size = xmlStrlen(cur); 460 if (size < 0) 461 return(NULL); 462 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar)); 463 if (ret == NULL) { 464 xmlErrMemory(NULL, NULL); 465 return(cur); 466 } 467 memcpy(&ret[size], add, len * sizeof(xmlChar)); 468 ret[size + len] = 0; 469 return(ret); 470 } 471 472 /** 473 * xmlStrncatNew: 474 * @str1: first xmlChar string 475 * @str2: second xmlChar string 476 * @len: the len of @str2 or < 0 477 * 478 * same as xmlStrncat, but creates a new string. The original 479 * two strings are not freed. If @len is < 0 then the length 480 * will be calculated automatically. 481 * 482 * Returns a new xmlChar * or NULL 483 */ 484 xmlChar * 485 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) { 486 int size; 487 xmlChar *ret; 488 489 if (len < 0) { 490 len = xmlStrlen(str2); 491 if (len < 0) 492 return(NULL); 493 } 494 if ((str2 == NULL) || (len == 0)) 495 return(xmlStrdup(str1)); 496 if (str1 == NULL) 497 return(xmlStrndup(str2, len)); 498 499 size = xmlStrlen(str1); 500 if (size < 0) 501 return(NULL); 502 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar)); 503 if (ret == NULL) { 504 xmlErrMemory(NULL, NULL); 505 return(xmlStrndup(str1, size)); 506 } 507 memcpy(ret, str1, size * sizeof(xmlChar)); 508 memcpy(&ret[size], str2, len * sizeof(xmlChar)); 509 ret[size + len] = 0; 510 return(ret); 511 } 512 513 /** 514 * xmlStrcat: 515 * @cur: the original xmlChar * array 516 * @add: the xmlChar * array added 517 * 518 * a strcat for array of xmlChar's. Since they are supposed to be 519 * encoded in UTF-8 or an encoding with 8bit based chars, we assume 520 * a termination mark of '0'. 521 * 522 * Returns a new xmlChar * containing the concatenated string. 523 */ 524 xmlChar * 525 xmlStrcat(xmlChar *cur, const xmlChar *add) { 526 const xmlChar *p = add; 527 528 if (add == NULL) return(cur); 529 if (cur == NULL) 530 return(xmlStrdup(add)); 531 532 while (*p != 0) p++; /* non input consuming */ 533 return(xmlStrncat(cur, add, p - add)); 534 } 535 536 /** 537 * xmlStrPrintf: 538 * @buf: the result buffer. 539 * @len: the result buffer length. 540 * @msg: the message with printf formatting. 541 * @...: extra parameters for the message. 542 * 543 * Formats @msg and places result into @buf. 544 * 545 * Returns the number of characters written to @buf or -1 if an error occurs. 546 */ 547 int XMLCDECL 548 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) { 549 va_list args; 550 int ret; 551 552 if((buf == NULL) || (msg == NULL)) { 553 return(-1); 554 } 555 556 va_start(args, msg); 557 ret = vsnprintf((char *) buf, len, (const char *) msg, args); 558 va_end(args); 559 buf[len - 1] = 0; /* be safe ! */ 560 561 return(ret); 562 } 563 564 /** 565 * xmlStrVPrintf: 566 * @buf: the result buffer. 567 * @len: the result buffer length. 568 * @msg: the message with printf formatting. 569 * @ap: extra parameters for the message. 570 * 571 * Formats @msg and places result into @buf. 572 * 573 * Returns the number of characters written to @buf or -1 if an error occurs. 574 */ 575 int 576 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) { 577 int ret; 578 579 if((buf == NULL) || (msg == NULL)) { 580 return(-1); 581 } 582 583 ret = vsnprintf((char *) buf, len, (const char *) msg, ap); 584 buf[len - 1] = 0; /* be safe ! */ 585 586 return(ret); 587 } 588 589 /************************************************************************ 590 * * 591 * Generic UTF8 handling routines * 592 * * 593 * From rfc2044: encoding of the Unicode values on UTF-8: * 594 * * 595 * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 596 * 0000 0000-0000 007F 0xxxxxxx * 597 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 598 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * 599 * * 600 * I hope we won't use values > 0xFFFF anytime soon ! * 601 * * 602 ************************************************************************/ 603 604 605 /** 606 * xmlUTF8Size: 607 * @utf: pointer to the UTF8 character 608 * 609 * calculates the internal size of a UTF8 character 610 * 611 * returns the numbers of bytes in the character, -1 on format error 612 */ 613 int 614 xmlUTF8Size(const xmlChar *utf) { 615 xmlChar mask; 616 int len; 617 618 if (utf == NULL) 619 return -1; 620 if (*utf < 0x80) 621 return 1; 622 /* check valid UTF8 character */ 623 if (!(*utf & 0x40)) 624 return -1; 625 /* determine number of bytes in char */ 626 len = 2; 627 for (mask=0x20; mask != 0; mask>>=1) { 628 if (!(*utf & mask)) 629 return len; 630 len++; 631 } 632 return -1; 633 } 634 635 /** 636 * xmlUTF8Charcmp: 637 * @utf1: pointer to first UTF8 char 638 * @utf2: pointer to second UTF8 char 639 * 640 * compares the two UCS4 values 641 * 642 * returns result of the compare as with xmlStrncmp 643 */ 644 int 645 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) { 646 647 if (utf1 == NULL ) { 648 if (utf2 == NULL) 649 return 0; 650 return -1; 651 } 652 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1)); 653 } 654 655 /** 656 * xmlUTF8Strlen: 657 * @utf: a sequence of UTF-8 encoded bytes 658 * 659 * compute the length of an UTF8 string, it doesn't do a full UTF8 660 * checking of the content of the string. 661 * 662 * Returns the number of characters in the string or -1 in case of error 663 */ 664 int 665 xmlUTF8Strlen(const xmlChar *utf) { 666 int ret = 0; 667 668 if (utf == NULL) 669 return(-1); 670 671 while (*utf != 0) { 672 if (utf[0] & 0x80) { 673 if ((utf[1] & 0xc0) != 0x80) 674 return(-1); 675 if ((utf[0] & 0xe0) == 0xe0) { 676 if ((utf[2] & 0xc0) != 0x80) 677 return(-1); 678 if ((utf[0] & 0xf0) == 0xf0) { 679 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 680 return(-1); 681 utf += 4; 682 } else { 683 utf += 3; 684 } 685 } else { 686 utf += 2; 687 } 688 } else { 689 utf++; 690 } 691 ret++; 692 } 693 return(ret); 694 } 695 696 /** 697 * xmlGetUTF8Char: 698 * @utf: a sequence of UTF-8 encoded bytes 699 * @len: a pointer to the minimum number of bytes present in 700 * the sequence. This is used to assure the next character 701 * is completely contained within the sequence. 702 * 703 * Read the first UTF8 character from @utf 704 * 705 * Returns the char value or -1 in case of error, and sets *len to 706 * the actual number of bytes consumed (0 in case of error) 707 */ 708 int 709 xmlGetUTF8Char(const unsigned char *utf, int *len) { 710 unsigned int c; 711 712 if (utf == NULL) 713 goto error; 714 if (len == NULL) 715 goto error; 716 if (*len < 1) 717 goto error; 718 719 c = utf[0]; 720 if (c & 0x80) { 721 if (*len < 2) 722 goto error; 723 if ((utf[1] & 0xc0) != 0x80) 724 goto error; 725 if ((c & 0xe0) == 0xe0) { 726 if (*len < 3) 727 goto error; 728 if ((utf[2] & 0xc0) != 0x80) 729 goto error; 730 if ((c & 0xf0) == 0xf0) { 731 if (*len < 4) 732 goto error; 733 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 734 goto error; 735 *len = 4; 736 /* 4-byte code */ 737 c = (utf[0] & 0x7) << 18; 738 c |= (utf[1] & 0x3f) << 12; 739 c |= (utf[2] & 0x3f) << 6; 740 c |= utf[3] & 0x3f; 741 } else { 742 /* 3-byte code */ 743 *len = 3; 744 c = (utf[0] & 0xf) << 12; 745 c |= (utf[1] & 0x3f) << 6; 746 c |= utf[2] & 0x3f; 747 } 748 } else { 749 /* 2-byte code */ 750 *len = 2; 751 c = (utf[0] & 0x1f) << 6; 752 c |= utf[1] & 0x3f; 753 } 754 } else { 755 /* 1-byte code */ 756 *len = 1; 757 } 758 return(c); 759 760 error: 761 if (len != NULL) 762 *len = 0; 763 return(-1); 764 } 765 766 /** 767 * xmlCheckUTF8: 768 * @utf: Pointer to putative UTF-8 encoded string. 769 * 770 * Checks @utf for being valid UTF-8. @utf is assumed to be 771 * null-terminated. This function is not super-strict, as it will 772 * allow longer UTF-8 sequences than necessary. Note that Java is 773 * capable of producing these sequences if provoked. Also note, this 774 * routine checks for the 4-byte maximum size, but does not check for 775 * 0x10ffff maximum value. 776 * 777 * Return value: true if @utf is valid. 778 **/ 779 int 780 xmlCheckUTF8(const unsigned char *utf) 781 { 782 int ix; 783 unsigned char c; 784 785 if (utf == NULL) 786 return(0); 787 /* 788 * utf is a string of 1, 2, 3 or 4 bytes. The valid strings 789 * are as follows (in "bit format"): 790 * 0xxxxxxx valid 1-byte 791 * 110xxxxx 10xxxxxx valid 2-byte 792 * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte 793 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte 794 */ 795 for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */ 796 if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */ 797 ix++; 798 } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */ 799 if ((utf[ix+1] & 0xc0 ) != 0x80) 800 return 0; 801 ix += 2; 802 } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */ 803 if (((utf[ix+1] & 0xc0) != 0x80) || 804 ((utf[ix+2] & 0xc0) != 0x80)) 805 return 0; 806 ix += 3; 807 } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */ 808 if (((utf[ix+1] & 0xc0) != 0x80) || 809 ((utf[ix+2] & 0xc0) != 0x80) || 810 ((utf[ix+3] & 0xc0) != 0x80)) 811 return 0; 812 ix += 4; 813 } else /* unknown encoding */ 814 return 0; 815 } 816 return(1); 817 } 818 819 /** 820 * xmlUTF8Strsize: 821 * @utf: a sequence of UTF-8 encoded bytes 822 * @len: the number of characters in the array 823 * 824 * storage size of an UTF8 string 825 * the behaviour is not garanteed if the input string is not UTF-8 826 * 827 * Returns the storage size of 828 * the first 'len' characters of ARRAY 829 */ 830 831 int 832 xmlUTF8Strsize(const xmlChar *utf, int len) { 833 const xmlChar *ptr=utf; 834 xmlChar ch; 835 836 if (utf == NULL) 837 return(0); 838 839 if (len <= 0) 840 return(0); 841 842 while ( len-- > 0) { 843 if ( !*ptr ) 844 break; 845 if ( (ch = *ptr++) & 0x80) 846 while ((ch<<=1) & 0x80 ) { 847 if (*ptr == 0) break; 848 ptr++; 849 } 850 } 851 return (ptr - utf); 852 } 853 854 855 /** 856 * xmlUTF8Strndup: 857 * @utf: the input UTF8 * 858 * @len: the len of @utf (in chars) 859 * 860 * a strndup for array of UTF8's 861 * 862 * Returns a new UTF8 * or NULL 863 */ 864 xmlChar * 865 xmlUTF8Strndup(const xmlChar *utf, int len) { 866 xmlChar *ret; 867 int i; 868 869 if ((utf == NULL) || (len < 0)) return(NULL); 870 i = xmlUTF8Strsize(utf, len); 871 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar)); 872 if (ret == NULL) { 873 xmlGenericError(xmlGenericErrorContext, 874 "malloc of %ld byte failed\n", 875 (len + 1) * (long)sizeof(xmlChar)); 876 return(NULL); 877 } 878 memcpy(ret, utf, i * sizeof(xmlChar)); 879 ret[i] = 0; 880 return(ret); 881 } 882 883 /** 884 * xmlUTF8Strpos: 885 * @utf: the input UTF8 * 886 * @pos: the position of the desired UTF8 char (in chars) 887 * 888 * a function to provide the equivalent of fetching a 889 * character from a string array 890 * 891 * Returns a pointer to the UTF8 character or NULL 892 */ 893 const xmlChar * 894 xmlUTF8Strpos(const xmlChar *utf, int pos) { 895 xmlChar ch; 896 897 if (utf == NULL) return(NULL); 898 if (pos < 0) 899 return(NULL); 900 while (pos--) { 901 if ((ch=*utf++) == 0) return(NULL); 902 if ( ch & 0x80 ) { 903 /* if not simple ascii, verify proper format */ 904 if ( (ch & 0xc0) != 0xc0 ) 905 return(NULL); 906 /* then skip over remaining bytes for this char */ 907 while ( (ch <<= 1) & 0x80 ) 908 if ( (*utf++ & 0xc0) != 0x80 ) 909 return(NULL); 910 } 911 } 912 return((xmlChar *)utf); 913 } 914 915 /** 916 * xmlUTF8Strloc: 917 * @utf: the input UTF8 * 918 * @utfchar: the UTF8 character to be found 919 * 920 * a function to provide the relative location of a UTF8 char 921 * 922 * Returns the relative character position of the desired char 923 * or -1 if not found 924 */ 925 int 926 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) { 927 int i, size; 928 xmlChar ch; 929 930 if (utf==NULL || utfchar==NULL) return -1; 931 size = xmlUTF8Strsize(utfchar, 1); 932 for(i=0; (ch=*utf) != 0; i++) { 933 if (xmlStrncmp(utf, utfchar, size)==0) 934 return(i); 935 utf++; 936 if ( ch & 0x80 ) { 937 /* if not simple ascii, verify proper format */ 938 if ( (ch & 0xc0) != 0xc0 ) 939 return(-1); 940 /* then skip over remaining bytes for this char */ 941 while ( (ch <<= 1) & 0x80 ) 942 if ( (*utf++ & 0xc0) != 0x80 ) 943 return(-1); 944 } 945 } 946 947 return(-1); 948 } 949 /** 950 * xmlUTF8Strsub: 951 * @utf: a sequence of UTF-8 encoded bytes 952 * @start: relative pos of first char 953 * @len: total number to copy 954 * 955 * Create a substring from a given UTF-8 string 956 * Note: positions are given in units of UTF-8 chars 957 * 958 * Returns a pointer to a newly created string 959 * or NULL if any problem 960 */ 961 962 xmlChar * 963 xmlUTF8Strsub(const xmlChar *utf, int start, int len) { 964 int i; 965 xmlChar ch; 966 967 if (utf == NULL) return(NULL); 968 if (start < 0) return(NULL); 969 if (len < 0) return(NULL); 970 971 /* 972 * Skip over any leading chars 973 */ 974 for (i = 0;i < start;i++) { 975 if ((ch=*utf++) == 0) return(NULL); 976 if ( ch & 0x80 ) { 977 /* if not simple ascii, verify proper format */ 978 if ( (ch & 0xc0) != 0xc0 ) 979 return(NULL); 980 /* then skip over remaining bytes for this char */ 981 while ( (ch <<= 1) & 0x80 ) 982 if ( (*utf++ & 0xc0) != 0x80 ) 983 return(NULL); 984 } 985 } 986 987 return(xmlUTF8Strndup(utf, len)); 988 } 989 990 /** 991 * xmlEscapeFormatString: 992 * @msg: a pointer to the string in which to escape '%' characters. 993 * Must be a heap-allocated buffer created by libxml2 that may be 994 * returned, or that may be freed and replaced. 995 * 996 * Replaces the string pointed to by 'msg' with an escaped string. 997 * Returns the same string with all '%' characters escaped. 998 */ 999 xmlChar * 1000 xmlEscapeFormatString(xmlChar **msg) 1001 { 1002 xmlChar *msgPtr = NULL; 1003 xmlChar *result = NULL; 1004 xmlChar *resultPtr = NULL; 1005 size_t count = 0; 1006 size_t msgLen = 0; 1007 size_t resultLen = 0; 1008 1009 if (!msg || !*msg) 1010 return(NULL); 1011 1012 for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) { 1013 ++msgLen; 1014 if (*msgPtr == '%') 1015 ++count; 1016 } 1017 1018 if (count == 0) 1019 return(*msg); 1020 1021 resultLen = msgLen + count + 1; 1022 result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar)); 1023 if (result == NULL) { 1024 /* Clear *msg to prevent format string vulnerabilities in 1025 out-of-memory situations. */ 1026 xmlFree(*msg); 1027 *msg = NULL; 1028 xmlErrMemory(NULL, NULL); 1029 return(NULL); 1030 } 1031 1032 for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) { 1033 *resultPtr = *msgPtr; 1034 if (*msgPtr == '%') 1035 *(++resultPtr) = '%'; 1036 } 1037 result[resultLen - 1] = '\0'; 1038 1039 xmlFree(*msg); 1040 *msg = result; 1041 1042 return *msg; 1043 } 1044 1045 #define bottom_xmlstring 1046 #include "elfgcchack.h" 1047