1/* 2Licensed under the Apache License, Version 2.0 (the "License"); 3you may not use this file except in compliance with the License. 4You may obtain a copy of the License at 5 6 http://www.apache.org/licenses/LICENSE-2.0 7 8Unless required by applicable law or agreed to in writing, software 9distributed under the License is distributed on an "AS IS" BASIS, 10WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11See the License for the specific language governing permissions and 12limitations under the License. 13*/ 14 15package candiedyaml 16 17import ( 18 "io" 19) 20 21/* 22 * Set the reader error and return 0. 23 */ 24 25func yaml_parser_set_reader_error(parser *yaml_parser_t, problem string, 26 offset int, value int) bool { 27 parser.error = yaml_READER_ERROR 28 parser.problem = problem 29 parser.problem_offset = offset 30 parser.problem_value = value 31 32 return false 33} 34 35/* 36 * Byte order marks. 37 */ 38const ( 39 BOM_UTF8 = "\xef\xbb\xbf" 40 BOM_UTF16LE = "\xff\xfe" 41 BOM_UTF16BE = "\xfe\xff" 42) 43 44/* 45 * Determine the input stream encoding by checking the BOM symbol. If no BOM is 46 * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. 47 */ 48 49func yaml_parser_determine_encoding(parser *yaml_parser_t) bool { 50 /* Ensure that we had enough bytes in the raw buffer. */ 51 for !parser.eof && 52 len(parser.raw_buffer)-parser.raw_buffer_pos < 3 { 53 if !yaml_parser_update_raw_buffer(parser) { 54 return false 55 } 56 } 57 58 /* Determine the encoding. */ 59 raw := parser.raw_buffer 60 pos := parser.raw_buffer_pos 61 remaining := len(raw) - pos 62 if remaining >= 2 && 63 raw[pos] == BOM_UTF16LE[0] && raw[pos+1] == BOM_UTF16LE[1] { 64 parser.encoding = yaml_UTF16LE_ENCODING 65 parser.raw_buffer_pos += 2 66 parser.offset += 2 67 } else if remaining >= 2 && 68 raw[pos] == BOM_UTF16BE[0] && raw[pos+1] == BOM_UTF16BE[1] { 69 parser.encoding = yaml_UTF16BE_ENCODING 70 parser.raw_buffer_pos += 2 71 parser.offset += 2 72 } else if remaining >= 3 && 73 raw[pos] == BOM_UTF8[0] && raw[pos+1] == BOM_UTF8[1] && raw[pos+2] == BOM_UTF8[2] { 74 parser.encoding = yaml_UTF8_ENCODING 75 parser.raw_buffer_pos += 3 76 parser.offset += 3 77 } else { 78 parser.encoding = yaml_UTF8_ENCODING 79 } 80 81 return true 82} 83 84/* 85 * Update the raw buffer. 86 */ 87 88func yaml_parser_update_raw_buffer(parser *yaml_parser_t) bool { 89 size_read := 0 90 91 /* Return if the raw buffer is full. */ 92 if parser.raw_buffer_pos == 0 && len(parser.raw_buffer) == cap(parser.raw_buffer) { 93 return true 94 } 95 96 /* Return on EOF. */ 97 98 if parser.eof { 99 return true 100 } 101 102 /* Move the remaining bytes in the raw buffer to the beginning. */ 103 if parser.raw_buffer_pos > 0 && parser.raw_buffer_pos < len(parser.raw_buffer) { 104 copy(parser.raw_buffer, parser.raw_buffer[parser.raw_buffer_pos:]) 105 } 106 parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)-parser.raw_buffer_pos] 107 parser.raw_buffer_pos = 0 108 109 /* Call the read handler to fill the buffer. */ 110 size_read, err := parser.read_handler(parser, 111 parser.raw_buffer[len(parser.raw_buffer):cap(parser.raw_buffer)]) 112 parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)+size_read] 113 114 if err == io.EOF { 115 parser.eof = true 116 } else if err != nil { 117 return yaml_parser_set_reader_error(parser, "input error: "+err.Error(), 118 parser.offset, -1) 119 } 120 121 return true 122} 123 124/* 125 * Ensure that the buffer contains at least `length` characters. 126 * Return 1 on success, 0 on failure. 127 * 128 * The length is supposed to be significantly less that the buffer size. 129 */ 130 131func yaml_parser_update_buffer(parser *yaml_parser_t, length int) bool { 132 /* Read handler must be set. */ 133 if parser.read_handler == nil { 134 panic("read handler must be set") 135 } 136 137 /* If the EOF flag is set and the raw buffer is empty, do nothing. */ 138 139 if parser.eof && parser.raw_buffer_pos == len(parser.raw_buffer) { 140 return true 141 } 142 143 /* Return if the buffer contains enough characters. */ 144 145 if parser.unread >= length { 146 return true 147 } 148 149 /* Determine the input encoding if it is not known yet. */ 150 151 if parser.encoding == yaml_ANY_ENCODING { 152 if !yaml_parser_determine_encoding(parser) { 153 return false 154 } 155 } 156 157 /* Move the unread characters to the beginning of the buffer. */ 158 buffer_end := len(parser.buffer) 159 if 0 < parser.buffer_pos && 160 parser.buffer_pos < buffer_end { 161 copy(parser.buffer, parser.buffer[parser.buffer_pos:]) 162 buffer_end -= parser.buffer_pos 163 parser.buffer_pos = 0 164 } else if parser.buffer_pos == buffer_end { 165 buffer_end = 0 166 parser.buffer_pos = 0 167 } 168 169 parser.buffer = parser.buffer[:cap(parser.buffer)] 170 171 /* Fill the buffer until it has enough characters. */ 172 first := true 173 for parser.unread < length { 174 /* Fill the raw buffer if necessary. */ 175 176 if !first || parser.raw_buffer_pos == len(parser.raw_buffer) { 177 if !yaml_parser_update_raw_buffer(parser) { 178 parser.buffer = parser.buffer[:buffer_end] 179 return false 180 } 181 } 182 first = false 183 184 /* Decode the raw buffer. */ 185 for parser.raw_buffer_pos != len(parser.raw_buffer) { 186 var value rune 187 var w int 188 189 raw_unread := len(parser.raw_buffer) - parser.raw_buffer_pos 190 incomplete := false 191 192 /* Decode the next character. */ 193 194 switch parser.encoding { 195 case yaml_UTF8_ENCODING: 196 197 /* 198 * Decode a UTF-8 character. Check RFC 3629 199 * (http://www.ietf.org/rfc/rfc3629.txt) for more details. 200 * 201 * The following table (taken from the RFC) is used for 202 * decoding. 203 * 204 * Char. number range | UTF-8 octet sequence 205 * (hexadecimal) | (binary) 206 * --------------------+------------------------------------ 207 * 0000 0000-0000 007F | 0xxxxxxx 208 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 209 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 210 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 211 * 212 * Additionally, the characters in the range 0xD800-0xDFFF 213 * are prohibited as they are reserved for use with UTF-16 214 * surrogate pairs. 215 */ 216 217 /* Determine the length of the UTF-8 sequence. */ 218 219 octet := parser.raw_buffer[parser.raw_buffer_pos] 220 w = width(octet) 221 222 /* Check if the leading octet is valid. */ 223 224 if w == 0 { 225 return yaml_parser_set_reader_error(parser, 226 "invalid leading UTF-8 octet", 227 parser.offset, int(octet)) 228 } 229 230 /* Check if the raw buffer contains an incomplete character. */ 231 232 if w > raw_unread { 233 if parser.eof { 234 return yaml_parser_set_reader_error(parser, 235 "incomplete UTF-8 octet sequence", 236 parser.offset, -1) 237 } 238 incomplete = true 239 break 240 } 241 242 /* Decode the leading octet. */ 243 switch { 244 case octet&0x80 == 0x00: 245 value = rune(octet & 0x7F) 246 case octet&0xE0 == 0xC0: 247 value = rune(octet & 0x1F) 248 case octet&0xF0 == 0xE0: 249 value = rune(octet & 0x0F) 250 case octet&0xF8 == 0xF0: 251 value = rune(octet & 0x07) 252 default: 253 value = 0 254 } 255 256 /* Check and decode the trailing octets. */ 257 258 for k := 1; k < w; k++ { 259 octet = parser.raw_buffer[parser.raw_buffer_pos+k] 260 261 /* Check if the octet is valid. */ 262 263 if (octet & 0xC0) != 0x80 { 264 return yaml_parser_set_reader_error(parser, 265 "invalid trailing UTF-8 octet", 266 parser.offset+k, int(octet)) 267 } 268 269 /* Decode the octet. */ 270 271 value = (value << 6) + rune(octet&0x3F) 272 } 273 274 /* Check the length of the sequence against the value. */ 275 switch { 276 case w == 1: 277 case w == 2 && value >= 0x80: 278 case w == 3 && value >= 0x800: 279 case w == 4 && value >= 0x10000: 280 default: 281 return yaml_parser_set_reader_error(parser, 282 "invalid length of a UTF-8 sequence", 283 parser.offset, -1) 284 } 285 286 /* Check the range of the value. */ 287 288 if (value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF { 289 return yaml_parser_set_reader_error(parser, 290 "invalid Unicode character", 291 parser.offset, int(value)) 292 } 293 case yaml_UTF16LE_ENCODING, 294 yaml_UTF16BE_ENCODING: 295 296 var low, high int 297 if parser.encoding == yaml_UTF16LE_ENCODING { 298 low, high = 0, 1 299 } else { 300 high, low = 1, 0 301 } 302 303 /* 304 * The UTF-16 encoding is not as simple as one might 305 * naively think. Check RFC 2781 306 * (http://www.ietf.org/rfc/rfc2781.txt). 307 * 308 * Normally, two subsequent bytes describe a Unicode 309 * character. However a special technique (called a 310 * surrogate pair) is used for specifying character 311 * values larger than 0xFFFF. 312 * 313 * A surrogate pair consists of two pseudo-characters: 314 * high surrogate area (0xD800-0xDBFF) 315 * low surrogate area (0xDC00-0xDFFF) 316 * 317 * The following formulas are used for decoding 318 * and encoding characters using surrogate pairs: 319 * 320 * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) 321 * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) 322 * W1 = 110110yyyyyyyyyy 323 * W2 = 110111xxxxxxxxxx 324 * 325 * where U is the character value, W1 is the high surrogate 326 * area, W2 is the low surrogate area. 327 */ 328 329 /* Check for incomplete UTF-16 character. */ 330 331 if raw_unread < 2 { 332 if parser.eof { 333 return yaml_parser_set_reader_error(parser, 334 "incomplete UTF-16 character", 335 parser.offset, -1) 336 } 337 incomplete = true 338 break 339 } 340 341 /* Get the character. */ 342 value = rune(parser.raw_buffer[parser.raw_buffer_pos+low]) + 343 (rune(parser.raw_buffer[parser.raw_buffer_pos+high]) << 8) 344 345 /* Check for unexpected low surrogate area. */ 346 347 if (value & 0xFC00) == 0xDC00 { 348 return yaml_parser_set_reader_error(parser, 349 "unexpected low surrogate area", 350 parser.offset, int(value)) 351 } 352 353 /* Check for a high surrogate area. */ 354 355 if (value & 0xFC00) == 0xD800 { 356 357 w = 4 358 359 /* Check for incomplete surrogate pair. */ 360 361 if raw_unread < 4 { 362 if parser.eof { 363 return yaml_parser_set_reader_error(parser, 364 "incomplete UTF-16 surrogate pair", 365 parser.offset, -1) 366 } 367 incomplete = true 368 break 369 } 370 371 /* Get the next character. */ 372 373 value2 := rune(parser.raw_buffer[parser.raw_buffer_pos+low+2]) + 374 (rune(parser.raw_buffer[parser.raw_buffer_pos+high+2]) << 8) 375 376 /* Check for a low surrogate area. */ 377 378 if (value2 & 0xFC00) != 0xDC00 { 379 return yaml_parser_set_reader_error(parser, 380 "expected low surrogate area", 381 parser.offset+2, int(value2)) 382 } 383 384 /* Generate the value of the surrogate pair. */ 385 386 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF) 387 } else { 388 w = 2 389 } 390 391 break 392 393 default: 394 panic("Impossible") /* Impossible. */ 395 } 396 397 /* Check if the raw buffer contains enough bytes to form a character. */ 398 399 if incomplete { 400 break 401 } 402 403 /* 404 * Check if the character is in the allowed range: 405 * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) 406 * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) 407 * | [#x10000-#x10FFFF] (32 bit) 408 */ 409 410 if !(value == 0x09 || value == 0x0A || value == 0x0D || 411 (value >= 0x20 && value <= 0x7E) || 412 (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) || 413 (value >= 0xE000 && value <= 0xFFFD) || 414 (value >= 0x10000 && value <= 0x10FFFF)) { 415 return yaml_parser_set_reader_error(parser, 416 "control characters are not allowed", 417 parser.offset, int(value)) 418 } 419 420 /* Move the raw pointers. */ 421 422 parser.raw_buffer_pos += w 423 parser.offset += w 424 425 /* Finally put the character into the buffer. */ 426 427 /* 0000 0000-0000 007F . 0xxxxxxx */ 428 if value <= 0x7F { 429 parser.buffer[buffer_end] = byte(value) 430 } else if value <= 0x7FF { 431 /* 0000 0080-0000 07FF . 110xxxxx 10xxxxxx */ 432 parser.buffer[buffer_end] = byte(0xC0 + (value >> 6)) 433 parser.buffer[buffer_end+1] = byte(0x80 + (value & 0x3F)) 434 } else if value <= 0xFFFF { 435 /* 0000 0800-0000 FFFF . 1110xxxx 10xxxxxx 10xxxxxx */ 436 parser.buffer[buffer_end] = byte(0xE0 + (value >> 12)) 437 parser.buffer[buffer_end+1] = byte(0x80 + ((value >> 6) & 0x3F)) 438 parser.buffer[buffer_end+2] = byte(0x80 + (value & 0x3F)) 439 } else { 440 /* 0001 0000-0010 FFFF . 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 441 parser.buffer[buffer_end] = byte(0xF0 + (value >> 18)) 442 parser.buffer[buffer_end+1] = byte(0x80 + ((value >> 12) & 0x3F)) 443 parser.buffer[buffer_end+2] = byte(0x80 + ((value >> 6) & 0x3F)) 444 parser.buffer[buffer_end+3] = byte(0x80 + (value & 0x3F)) 445 } 446 447 buffer_end += w 448 parser.unread++ 449 } 450 451 /* On EOF, put NUL into the buffer and return. */ 452 453 if parser.eof { 454 parser.buffer[buffer_end] = 0 455 buffer_end++ 456 parser.buffer = parser.buffer[:buffer_end] 457 parser.unread++ 458 return true 459 } 460 461 } 462 463 parser.buffer = parser.buffer[:buffer_end] 464 return true 465} 466