1/*
2Licensed under the Apache License, Version 2.0 (the "License");
3you may not use this file except in compliance with the License.
4You may obtain a copy of the License at
5
6    http://www.apache.org/licenses/LICENSE-2.0
7
8Unless required by applicable law or agreed to in writing, software
9distributed under the License is distributed on an "AS IS" BASIS,
10WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11See the License for the specific language governing permissions and
12limitations under the License.
13*/
14
15package candiedyaml
16
17import (
18	"io"
19)
20
21/*
22 * Set the reader error and return 0.
23 */
24
25func yaml_parser_set_reader_error(parser *yaml_parser_t, problem string,
26	offset int, value int) bool {
27	parser.error = yaml_READER_ERROR
28	parser.problem = problem
29	parser.problem_offset = offset
30	parser.problem_value = value
31
32	return false
33}
34
/*
 * Byte order marks, as the raw byte sequences that may open an input stream.
 */
const (
	/* The UTF-8 byte order mark (three bytes). */
	BOM_UTF8 = "\xef\xbb\xbf"

	/* The UTF-16 little-endian byte order mark (two bytes). */
	BOM_UTF16LE = "\xff\xfe"

	/* The UTF-16 big-endian byte order mark (two bytes). */
	BOM_UTF16BE = "\xfe\xff"
)
43
44/*
45 * Determine the input stream encoding by checking the BOM symbol. If no BOM is
46 * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
47 */
48
49func yaml_parser_determine_encoding(parser *yaml_parser_t) bool {
50	/* Ensure that we had enough bytes in the raw buffer. */
51	for !parser.eof &&
52		len(parser.raw_buffer)-parser.raw_buffer_pos < 3 {
53		if !yaml_parser_update_raw_buffer(parser) {
54			return false
55		}
56	}
57
58	/* Determine the encoding. */
59	raw := parser.raw_buffer
60	pos := parser.raw_buffer_pos
61	remaining := len(raw) - pos
62	if remaining >= 2 &&
63		raw[pos] == BOM_UTF16LE[0] && raw[pos+1] == BOM_UTF16LE[1] {
64		parser.encoding = yaml_UTF16LE_ENCODING
65		parser.raw_buffer_pos += 2
66		parser.offset += 2
67	} else if remaining >= 2 &&
68		raw[pos] == BOM_UTF16BE[0] && raw[pos+1] == BOM_UTF16BE[1] {
69		parser.encoding = yaml_UTF16BE_ENCODING
70		parser.raw_buffer_pos += 2
71		parser.offset += 2
72	} else if remaining >= 3 &&
73		raw[pos] == BOM_UTF8[0] && raw[pos+1] == BOM_UTF8[1] && raw[pos+2] == BOM_UTF8[2] {
74		parser.encoding = yaml_UTF8_ENCODING
75		parser.raw_buffer_pos += 3
76		parser.offset += 3
77	} else {
78		parser.encoding = yaml_UTF8_ENCODING
79	}
80
81	return true
82}
83
84/*
85 * Update the raw buffer.
86 */
87
88func yaml_parser_update_raw_buffer(parser *yaml_parser_t) bool {
89	size_read := 0
90
91	/* Return if the raw buffer is full. */
92	if parser.raw_buffer_pos == 0 && len(parser.raw_buffer) == cap(parser.raw_buffer) {
93		return true
94	}
95
96	/* Return on EOF. */
97
98	if parser.eof {
99		return true
100	}
101
102	/* Move the remaining bytes in the raw buffer to the beginning. */
103	if parser.raw_buffer_pos > 0 && parser.raw_buffer_pos < len(parser.raw_buffer) {
104		copy(parser.raw_buffer, parser.raw_buffer[parser.raw_buffer_pos:])
105	}
106	parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)-parser.raw_buffer_pos]
107	parser.raw_buffer_pos = 0
108
109	/* Call the read handler to fill the buffer. */
110	size_read, err := parser.read_handler(parser,
111		parser.raw_buffer[len(parser.raw_buffer):cap(parser.raw_buffer)])
112	parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)+size_read]
113
114	if err == io.EOF {
115		parser.eof = true
116	} else if err != nil {
117		return yaml_parser_set_reader_error(parser, "input error: "+err.Error(),
118			parser.offset, -1)
119	}
120
121	return true
122}
123
124/*
125 * Ensure that the buffer contains at least `length` characters.
126 * Return 1 on success, 0 on failure.
127 *
128 * The length is supposed to be significantly less that the buffer size.
129 */
130
131func yaml_parser_update_buffer(parser *yaml_parser_t, length int) bool {
132	/* Read handler must be set. */
133	if parser.read_handler == nil {
134		panic("read handler must be set")
135	}
136
137	/* If the EOF flag is set and the raw buffer is empty, do nothing. */
138
139	if parser.eof && parser.raw_buffer_pos == len(parser.raw_buffer) {
140		return true
141	}
142
143	/* Return if the buffer contains enough characters. */
144
145	if parser.unread >= length {
146		return true
147	}
148
149	/* Determine the input encoding if it is not known yet. */
150
151	if parser.encoding == yaml_ANY_ENCODING {
152		if !yaml_parser_determine_encoding(parser) {
153			return false
154		}
155	}
156
157	/* Move the unread characters to the beginning of the buffer. */
158	buffer_end := len(parser.buffer)
159	if 0 < parser.buffer_pos &&
160		parser.buffer_pos < buffer_end {
161		copy(parser.buffer, parser.buffer[parser.buffer_pos:])
162		buffer_end -= parser.buffer_pos
163		parser.buffer_pos = 0
164	} else if parser.buffer_pos == buffer_end {
165		buffer_end = 0
166		parser.buffer_pos = 0
167	}
168
169	parser.buffer = parser.buffer[:cap(parser.buffer)]
170
171	/* Fill the buffer until it has enough characters. */
172	first := true
173	for parser.unread < length {
174		/* Fill the raw buffer if necessary. */
175
176		if !first || parser.raw_buffer_pos == len(parser.raw_buffer) {
177			if !yaml_parser_update_raw_buffer(parser) {
178				parser.buffer = parser.buffer[:buffer_end]
179				return false
180			}
181		}
182		first = false
183
184		/* Decode the raw buffer. */
185		for parser.raw_buffer_pos != len(parser.raw_buffer) {
186			var value rune
187			var w int
188
189			raw_unread := len(parser.raw_buffer) - parser.raw_buffer_pos
190			incomplete := false
191
192			/* Decode the next character. */
193
194			switch parser.encoding {
195			case yaml_UTF8_ENCODING:
196
197				/*
198				 * Decode a UTF-8 character.  Check RFC 3629
199				 * (http://www.ietf.org/rfc/rfc3629.txt) for more details.
200				 *
201				 * The following table (taken from the RFC) is used for
202				 * decoding.
203				 *
204				 *    Char. number range |        UTF-8 octet sequence
205				 *      (hexadecimal)    |              (binary)
206				 *   --------------------+------------------------------------
207				 *   0000 0000-0000 007F | 0xxxxxxx
208				 *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
209				 *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
210				 *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
211				 *
212				 * Additionally, the characters in the range 0xD800-0xDFFF
213				 * are prohibited as they are reserved for use with UTF-16
214				 * surrogate pairs.
215				 */
216
217				/* Determine the length of the UTF-8 sequence. */
218
219				octet := parser.raw_buffer[parser.raw_buffer_pos]
220				w = width(octet)
221
222				/* Check if the leading octet is valid. */
223
224				if w == 0 {
225					return yaml_parser_set_reader_error(parser,
226						"invalid leading UTF-8 octet",
227						parser.offset, int(octet))
228				}
229
230				/* Check if the raw buffer contains an incomplete character. */
231
232				if w > raw_unread {
233					if parser.eof {
234						return yaml_parser_set_reader_error(parser,
235							"incomplete UTF-8 octet sequence",
236							parser.offset, -1)
237					}
238					incomplete = true
239					break
240				}
241
242				/* Decode the leading octet. */
243				switch {
244				case octet&0x80 == 0x00:
245					value = rune(octet & 0x7F)
246				case octet&0xE0 == 0xC0:
247					value = rune(octet & 0x1F)
248				case octet&0xF0 == 0xE0:
249					value = rune(octet & 0x0F)
250				case octet&0xF8 == 0xF0:
251					value = rune(octet & 0x07)
252				default:
253					value = 0
254				}
255
256				/* Check and decode the trailing octets. */
257
258				for k := 1; k < w; k++ {
259					octet = parser.raw_buffer[parser.raw_buffer_pos+k]
260
261					/* Check if the octet is valid. */
262
263					if (octet & 0xC0) != 0x80 {
264						return yaml_parser_set_reader_error(parser,
265							"invalid trailing UTF-8 octet",
266							parser.offset+k, int(octet))
267					}
268
269					/* Decode the octet. */
270
271					value = (value << 6) + rune(octet&0x3F)
272				}
273
274				/* Check the length of the sequence against the value. */
275				switch {
276				case w == 1:
277				case w == 2 && value >= 0x80:
278				case w == 3 && value >= 0x800:
279				case w == 4 && value >= 0x10000:
280				default:
281					return yaml_parser_set_reader_error(parser,
282						"invalid length of a UTF-8 sequence",
283						parser.offset, -1)
284				}
285
286				/* Check the range of the value. */
287
288				if (value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF {
289					return yaml_parser_set_reader_error(parser,
290						"invalid Unicode character",
291						parser.offset, int(value))
292				}
293			case yaml_UTF16LE_ENCODING,
294				yaml_UTF16BE_ENCODING:
295
296				var low, high int
297				if parser.encoding == yaml_UTF16LE_ENCODING {
298					low, high = 0, 1
299				} else {
300					high, low = 1, 0
301				}
302
303				/*
304				 * The UTF-16 encoding is not as simple as one might
305				 * naively think.  Check RFC 2781
306				 * (http://www.ietf.org/rfc/rfc2781.txt).
307				 *
308				 * Normally, two subsequent bytes describe a Unicode
309				 * character.  However a special technique (called a
310				 * surrogate pair) is used for specifying character
311				 * values larger than 0xFFFF.
312				 *
313				 * A surrogate pair consists of two pseudo-characters:
314				 *      high surrogate area (0xD800-0xDBFF)
315				 *      low surrogate area (0xDC00-0xDFFF)
316				 *
317				 * The following formulas are used for decoding
318				 * and encoding characters using surrogate pairs:
319				 *
320				 *  U  = U' + 0x10000   (0x01 00 00 <= U <= 0x10 FF FF)
321				 *  U' = yyyyyyyyyyxxxxxxxxxx   (0 <= U' <= 0x0F FF FF)
322				 *  W1 = 110110yyyyyyyyyy
323				 *  W2 = 110111xxxxxxxxxx
324				 *
325				 * where U is the character value, W1 is the high surrogate
326				 * area, W2 is the low surrogate area.
327				 */
328
329				/* Check for incomplete UTF-16 character. */
330
331				if raw_unread < 2 {
332					if parser.eof {
333						return yaml_parser_set_reader_error(parser,
334							"incomplete UTF-16 character",
335							parser.offset, -1)
336					}
337					incomplete = true
338					break
339				}
340
341				/* Get the character. */
342				value = rune(parser.raw_buffer[parser.raw_buffer_pos+low]) +
343					(rune(parser.raw_buffer[parser.raw_buffer_pos+high]) << 8)
344
345				/* Check for unexpected low surrogate area. */
346
347				if (value & 0xFC00) == 0xDC00 {
348					return yaml_parser_set_reader_error(parser,
349						"unexpected low surrogate area",
350						parser.offset, int(value))
351				}
352
353				/* Check for a high surrogate area. */
354
355				if (value & 0xFC00) == 0xD800 {
356
357					w = 4
358
359					/* Check for incomplete surrogate pair. */
360
361					if raw_unread < 4 {
362						if parser.eof {
363							return yaml_parser_set_reader_error(parser,
364								"incomplete UTF-16 surrogate pair",
365								parser.offset, -1)
366						}
367						incomplete = true
368						break
369					}
370
371					/* Get the next character. */
372
373					value2 := rune(parser.raw_buffer[parser.raw_buffer_pos+low+2]) +
374						(rune(parser.raw_buffer[parser.raw_buffer_pos+high+2]) << 8)
375
376					/* Check for a low surrogate area. */
377
378					if (value2 & 0xFC00) != 0xDC00 {
379						return yaml_parser_set_reader_error(parser,
380							"expected low surrogate area",
381							parser.offset+2, int(value2))
382					}
383
384					/* Generate the value of the surrogate pair. */
385
386					value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF)
387				} else {
388					w = 2
389				}
390
391				break
392
393			default:
394				panic("Impossible") /* Impossible. */
395			}
396
397			/* Check if the raw buffer contains enough bytes to form a character. */
398
399			if incomplete {
400				break
401			}
402
403			/*
404			 * Check if the character is in the allowed range:
405			 *      #x9 | #xA | #xD | [#x20-#x7E]               (8 bit)
406			 *      | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD]    (16 bit)
407			 *      | [#x10000-#x10FFFF]                        (32 bit)
408			 */
409
410			if !(value == 0x09 || value == 0x0A || value == 0x0D ||
411				(value >= 0x20 && value <= 0x7E) ||
412				(value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) ||
413				(value >= 0xE000 && value <= 0xFFFD) ||
414				(value >= 0x10000 && value <= 0x10FFFF)) {
415				return yaml_parser_set_reader_error(parser,
416					"control characters are not allowed",
417					parser.offset, int(value))
418			}
419
420			/* Move the raw pointers. */
421
422			parser.raw_buffer_pos += w
423			parser.offset += w
424
425			/* Finally put the character into the buffer. */
426
427			/* 0000 0000-0000 007F . 0xxxxxxx */
428			if value <= 0x7F {
429				parser.buffer[buffer_end] = byte(value)
430			} else if value <= 0x7FF {
431				/* 0000 0080-0000 07FF . 110xxxxx 10xxxxxx */
432				parser.buffer[buffer_end] = byte(0xC0 + (value >> 6))
433				parser.buffer[buffer_end+1] = byte(0x80 + (value & 0x3F))
434			} else if value <= 0xFFFF {
435				/* 0000 0800-0000 FFFF . 1110xxxx 10xxxxxx 10xxxxxx */
436				parser.buffer[buffer_end] = byte(0xE0 + (value >> 12))
437				parser.buffer[buffer_end+1] = byte(0x80 + ((value >> 6) & 0x3F))
438				parser.buffer[buffer_end+2] = byte(0x80 + (value & 0x3F))
439			} else {
440				/* 0001 0000-0010 FFFF . 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
441				parser.buffer[buffer_end] = byte(0xF0 + (value >> 18))
442				parser.buffer[buffer_end+1] = byte(0x80 + ((value >> 12) & 0x3F))
443				parser.buffer[buffer_end+2] = byte(0x80 + ((value >> 6) & 0x3F))
444				parser.buffer[buffer_end+3] = byte(0x80 + (value & 0x3F))
445			}
446
447			buffer_end += w
448			parser.unread++
449		}
450
451		/* On EOF, put NUL into the buffer and return. */
452
453		if parser.eof {
454			parser.buffer[buffer_end] = 0
455			buffer_end++
456			parser.buffer = parser.buffer[:buffer_end]
457			parser.unread++
458			return true
459		}
460
461	}
462
463	parser.buffer = parser.buffer[:buffer_end]
464	return true
465}
466