1# 2# Reflects Oren's comments, adds yamlbyte.h at the bottom 3# 4subject: Revision #4 of YAML Bytecodes 5summary: > 6 This proposal defines a 'preparsed' format where a YAML syntax 7 is converted into a series of events, as bytecodes. Each bytecode 8 appears on its own line, starting with a single character and ending 9 with a line feed character, '\n'. 10codes: 11 # 12 # Primary Bytecodes (Capital Letters) 13 # 14 # These bytecodes form the minimum needed to represent YAML information 15 # from the serial model (ie, without format and comments) 16 # 17 'D': 18 name: Document 19 desc: > 20 Indicates that a document has begun, either it is 21 the beginning of a YAML stream, or a --- has been 22 found. Thus, an empty document is expressed 23 as "D\n" 24 'V': 25 name: Directive 26 desc: > 27 This represents any YAML directives immediately following 28 a 'D' bytecode. For example '--- %YAML:1.0' produces the 29 bytecode "D\nVYAML:1.0\n". 30 'P': 31 name: Pause Stream 32 desc: > 33 This is the instruction when a document is terminated, but 34 another document has not yet begun. Thus, it is optional, 35 and typically used to pause parsing. For example, 36 a stream starting with an empty document, but then in a 37 hold state for the next document would be: "D\nP\n" 38 '\z': 39 name: Finish (end stream) 40 desc: > 41 YAML bytecodes are meant to be passable as a single "C" 42 string, and thus the null terminator can optionally be 43 used to signal the end of a stream. When writing bytecodes 44 out to a flat file, the file need not contain a null 45 terminator; however, when read into memory it should 46 always have a null terminator. 47 'M': 48 name: Mapping 49 desc: > 50 Indicates the begin of a mapping, children of the 51 mapping are provided as a series of K1,V1,K2,V2 52 pairs as they are found in the input stream. For 53 example, the bytecodes for "{ a: b, c: d }" would 54 be "M\nSa\nSb\nSc\nSd\nE\n" 55 'Q': 56 name: Sequence 57 desc: > 58 Indicates the begin of a sequence, children are provided 59 following till a '.' bytecode is encountered. So, the 60 bytecodes for "[ one, two ]" would be "Q\nSone\nStwo\nE\n" 61 'E': 62 name: End Collection 63 desc: > 64 This closes the outermost Collection (Mapping, Sequence), 65 note that the document has one and only one node following 66 it, therefore it is not a branch. 67 'S': 68 name: Scalar 69 desc: > 70 This indicates the start of a scalar value, which can 71 be continued by the 'N' and 'C' bytecodes. This bytecode 72 is used for sequence entries, keys, values, etc. 73 'C': 74 name: Scalar Continuation 75 desc: > 76 Since a scalar may not fit within a buffer, and since it 77 may not contain a \n character, it may have to be broken 78 into several chunks. 79 'N': 80 name: Normalized New Line (in a scalar value) 81 desc: > 82 Scalar values must be chunked so that new lines and 83 null values do not occur within a 'S' or 'C' bytecode 84 (in the bytecodes, all other C0 need not be escaped). 85 This bytecode is then used to represent one or more 86 newlines, with the number of newlines optionally 87 following. For example, 88 "Hello\nWorld" would be "SHello\nN\nCWorld\n", and 89 "Hello\n\n\nWorld" is "SHello\nN3\nCWorld\n" 90 91 If the new line is an LS or a PS, the N bytecode can 92 be followed with a L or P. Thus, "Hello\PWorld\L" is 93 reported "SHello\nNP\nWorld\NL\n" 94 95 'Z': 96 name: Null Character (in a scalar value) 97 desc: > 98 As in normalized new lines above, since the null character 99 cannot be used in the bytecodes, is must be escaped, ie, 100 "Hello\zWorld" would be "SHello\nZ\nCWorld\n". 101 'A': 102 name: Alias 103 desc: > 104 This is used when ever there is an alias node, for 105 example, "[ &X one, *X ]" would be normalized 106 to "S\nAX\nSone\nRX\nE\n" -- in this example, the 107 anchor bytecode applies to the very next content 108 bytecode. 109 'R': 110 name: Reference (Anchor) 111 desc: > 112 This bytecode associates an anchor with the very next 113 content node, see the 'A' alias bytecode. 114 'T': 115 name: Transfer 116 desc: > 117 This is the transfer method. If the value begins with 118 a '!', then it is not normalized. Otherwise, the value 119 is a fully qualified URL, with a semicolon. The transfer 120 method applies only to the node immediately following, 121 and thus it can be seen as a modifier like the anchor. 122 For example, "Ttag:yaml.org,2002:str\nSstring\n" is 123 normalized, "T!str\nSstring\n" is not. 124 # 125 # Formatting bytecodes (lower case) 126 # 127 # The following bytecodes are purely at the syntax level and 128 # useful for pretty printers and emitters. Since the range of 129 # lower case letters is contiguous, it could be easy for a 130 # processor to simply ignore all bytecodes in this range. 131 # 132 'c': 133 name: Comment 134 desc: > 135 This is a single line comment. It is terminated like all 136 of the other variable length items, with a '\n'. 137 'i': 138 name: Indent 139 desc: > 140 Specifies number of additional spaces to indent for 141 subsequent block style nodes, "i4\n" specifies 4 char indent. 142 's': 143 name: Scalar styling 144 desc: > 145 This bytecode, is followed with one of the following 146 items to indicate the style to be used for the very 147 next content node. It is an error to specify a style for 148 a scalar other than double quoted when it must be escaped. 149 Furthermore, there must be agreement between the style 150 and the very next content node, in other words, a scalar 151 style requires that the next content node be an S. 152 153 > flow scalar 154 " double quoted scalar 155 ' single quoted scalar 156 | literal scalar 157 p plain scalar 158 { inline mapping 159 [ inline sequence 160 b block style (for mappings and sequences'") 161 162 # 163 # Advanced bytecodes (not alphabetic) 164 # 165 # These are optional goodies which one could find useful. 166 # 167 '#': 168 name: Line Number 169 desc: > 170 This bytecode allows the line number of the very next 171 node to be reported. 172 '!': 173 name: Notice 174 desc: > 175 This is a message sent from the producer to the consumer 176 regarding the state of the stream or document. It does 177 not necessarly end a stream, as the 'finish' bytecode can 178 be used for this purpose. This signal has a packed format, 179 with the error number, a comma, and a textual message: 180 "#22\n!73,Indentation mismatch\n" 181 "#132\n!84,Tabs are illegal for indentation\n" 182 ',': 183 name: Span 184 desc: > 185 This bytecode gives the span of the very next 'S', 'M', 186 or 'Q' bytecode -- including its subordinates. For scalars, 187 it includes the span of all subordinate 'N' and 'C' codes. 188 For mappings or sequences, this gives the length all the 189 way to the corresponding 'E' bytecode so that the entire 190 branch can be skipped. The length is given starting at 191 the corresponding 'S', 'M' or 'Q' bytecode and extends 192 to the first character following subordinate nodes. 193 194 Since this length instruction is meant to be used to 'speed' 195 things up, and since calculating the length via hand is not 196 really ideal, the length is expressed in Hex. This will allow 197 programs to easily convert the length to an actual value 198 (converting from hex to integers is easier than decimal). 199 Furthermore, all leading x's are ignored (so that they can 200 be filled in later) and if the bytecode value is all x's, 201 then the length is unknown. Lastly, this length is expressed 202 in 8 bit units for UTF-8, and 16 bit units for UTF-16. 203 204 For example, 205 --- [[one, two], three] 206 Is expressed as, 207 "?25\nD\n?x1E\nQ\n?xxE\nQ\nSone\nStwo\nE\nSthree\nE\n" 208 209 Thus it is seen that the address of D plus 37 is the null 210 terminator for the string, the first 'Q' plus 30 also 211 gives the null teriminator, and the second 'Q' plus 212 14 jumps to the opening 'S' for the third scalar. 213 '@': 214 name: Allocate 215 desc: > 216 This is a hint telling the processor how many items 217 are in the following collection (mapping pairs, or 218 sequence values), or how many character units need 219 to be allocated to hold the next value. Clearly this 220 is encoding specific value. The length which 221 follows is in hex (not decimal). 222 223 For example, "one", could be "@x3\nSone" 224 225design: 226 - 227 name: streaming support 228 problem: > 229 The interface should ideally allow for a YAML document to be 230 moved incrementally as a stream through a process. In particular, 231 YAML is inheritently line oriented, thus the interface should 232 probably reflect this fundamental character. 233 solution: > 234 The bytecodes deliver scalars as chunks, each chunk limited to 235 at most one line. While this is not ideal for passing large 236 binary objects, it is simple and easy to understand. 237 - 238 name: push 239 problem: > 240 The most common 'parsers' out there for YAML are push style, where 241 the producer owns the 'C' program stack, and the consumer keeps 242 its state as a heap object. Ideal use of a push interface is an 243 emitter, since this allows the sender (the application program) 244 to use the program stack and thus keep its state on the call stack 245 in local, automatic variables. 246 solution: > 247 A push interface simply can call a single event handler with a 248 (bytecode, payload) tuple. Since the core complexity is in the 249 bytecodes, the actual function signature is straight-forward 250 allowing for relative language independence. Since the bytecode 251 is always one character, the event handler could just receive 252 a string where the tuple is implicit. 253 - 254 name: pull 255 problem: > 256 The other alternative for a streaming interface is a 'pull' mechanism, 257 or iterator model where the consumer owns the C stack and the producer 258 keeps any state needed as a heap object. Ideal use of a pull 259 interface is a parser, since this allows the receiver (the application 260 program) to use the program stack, keeping its state on the call stack 261 in local variables. 262 solution: > 263 A pull interface would also be a simple function, that when called 264 filles a buffer with binary node(s). Or, in a language with 265 garbage collection, could be implemented as an iterator returning 266 a string containing the bytecode line (bytecode followed immediately 267 by the bytecode argument as a single string) or as a tuple. 268 - 269 name: pull2push 270 problem: > 271 This is done easily via a small loop which pulls from the 272 iterator and pushes to the event handler. 273 solution: > 274 For python, assuming the parser is implemented as an iterator 275 where one can 'pull' bytecode, args tuples, and assuming the 276 emitter has a event callback taking a bytecode, args tuple, 277 we have: 278 279 def push2pull(parser, emitter): 280 for (bytecode, args) in parser: 281 emitter.push(bytecode, args) 282 283 - 284 name: push2pull 285 problem: > 286 This requires the entire YAML stream be cashed in memory, or 287 each of the two stages in a thread or different continuation 288 with shared memory or pipe between them. 289 solution: > 290 This use case seems much easier with a binary stream; that is, 291 one need not convert the style of functions between the push 292 vs pull pattern. And, for languages supporting continuations, 293 (ruby) perhaps push vs pull is not even an issue... for a 294 language like python, one would use the threaded Queue object, 295 one thread pushes (bytecode, args) tuples into the Queue, while 296 the other thread pulls the tuples out. Simple. 297 - 298 name: neutrality 299 problem: > 300 It would be ideal of the C Program interface was simple enough 301 to be independent of programming language. In an ideal case, 302 imagine a flow of YAML structured data through various processing 303 stages on a server; where each processing stage is written in 304 a different programming language. 305 solution: > 306 While it may be hard for each language to write a syntax parser 307 filled with all of the little details, it would be much much 308 easier to write a parser for these bytecodes; as it involves 309 simple string handling, dispatching on the first character in 310 each string. 311 - 312 name: tools 313 problem: > 314 A goal of mine is to have a YPATH expression language, a schema 315 language, and a transformation language. I would like these items 316 to be reusable by a great number of platforms/languages, and in 317 particular as its own callable processing stage. 318 solution: > 319 If such an expression language was written on top of a bytecode 320 format like this, via a simple pull function (/w adapters for 321 push2pull and pull2push) quite a bit of reusability could emerge. 322 Imagine a schema validator which is injected into the bytecode stream 323 and it is an identity operation unless an exception occurs, in 324 which case, it terminates the document and makes the next document 325 be a description of the validation error. 326 - 327 name: encoding 328 problem: > 329 Text within the bytecode format must be given an encoding. There are 330 several considerations at hand listed below. 331 solution: > 332 The YAML bytecode format uses the same encodings as YAML itself, 333 and thus is independent of actual encoding. A parser library should 334 have several functions to convert between the encodings. 335examples: 336 - 337 yaml: | 338 --- 339 - plain 340 - > 341 this is a flow scalar 342 - > 343 another flow scalar which is continued 344 on a second line and indented 2 spaces 345 - &001 !str | 346 This is a block scalar, both typed 347 and anchored 348 - *001 # this was an alias 349 - "This is a \"double quoted\" scalar" 350 bytecode: | 351 D 352 Q 353 Splain 354 f 355 Sthis is a flow scalar 356 Sanother flow scalar which is continued 357 Con a second line and indented 2 spaces 358 b 359 a001 360 t!str 361 SThis is a block scalar, both typed 362 N 363 Cand anchored 364 R001 365 cthis was an alias 366 d 367 SThis is a "double quoted" scalar 368 E 369cheader: | 370 /* yamlbyte.h 371 * 372 * The YAML bytecode "C" interface header file. See the YAML bytecode 373 * reference for bytecode sequence rules and for the meaning of each 374 * bytecode. 375 */ 376 377 #ifndef YAMLBYTE_H 378 #define YAMLBYTE_H 379 #include <stddef.h> 380 /* list out the various YAML bytecodes */ 381 typedef enum { 382 /* content bytecodes */ 383 YAML_FINISH = 0, 384 YAML_DOCUMENT = 'D', 385 YAML_DIRECTIVE = 'V', 386 YAML_PAUSE = 'P', 387 YAML_MAPPING = 'M', 388 YAML_SEQUENCE = 'S', 389 YAML_ENDMAPSEQ = 'E', 390 YAML_SCALAR = 'S', 391 YAML_CONTINUE = 'C', 392 YAML_NEWLINE = 'N', 393 YAML_NULLCHAR = 'Z', 394 YAML_ALIAS = 'A', 395 YAML_ANCHOR = 'R', 396 YAML_TRANSFER = 'T', 397 /* formatting bytecodes */ 398 YAML_COMMENT = 'c', 399 YAML_INDENT = 'i', 400 YAML_STYLE = 's', 401 /* other bytecodes */ 402 YAML_LINENUMBER = '#', 403 YAML_NOTICE = '!', 404 YAML_SPAN = ',', 405 YAML_ALLOC = '@' 406 } yaml_code_t; 407 408 /* additional modifiers for the YAML_STYLE bytecode */ 409 typedef enum { 410 YAML_FLOW = '>', 411 YAML_LITERAL = '|', 412 YAML_BLOCK = 'b', 413 YAML_PLAIN = 'p', 414 YAML_INLINE_MAPPING = '{', 415 YAML_INLINE_SEQUENCE = '}', 416 YAML_SINGLE_QUOTED = 39, 417 YAML_DOUBLE_QUOTED = '"' 418 } yaml_style_t; 419 420 typedef unsigned char yaml_utf8_t; 421 typedef unsigned short yaml_utf16_t; 422 #ifdef YAML_UTF8 423 #ifdef YAML_UTF16 424 #error Must only define YAML_UTF8 or YAML_UTF16 425 #endif 426 typedef yaml_utf8_t yaml_char_t; 427 #else 428 #ifdef YAML_UTF16 429 typedef yaml_utf16_t yaml_char_t; 430 #else 431 #error Must define YAML_UTF8 or YAML_UTF16 432 #endif 433 #endif 434 435 /* return value for push function, tell parser if you want to stop */ 436 typedef enum { 437 YAML_MORE = 1, /* producer should continue to fire events */ 438 YAML_STOP = 0 /* producer should stop firing events */ 439 } yaml_more_t; 440 441 /* push bytecodes from a producer to a consumer 442 * where arg is null terminated /w a length */ 443 typedef void * yaml_consumer_t; 444 typedef 445 yaml_more_t 446 (*yaml_push_t)( 447 yaml_consumer_t self, 448 yaml_code_t code, 449 const yaml_char_t *arg, 450 size_t arglen 451 ); 452 453 /* pull bytecodes by the producer from the consumer, where 454 * producer must null terminate buff and return the number 455 * of sizeof(yaml_char_t) bytes used */ 456 typedef void * yaml_producer_t; 457 typedef 458 size_t 459 (*yaml_pull_t)( 460 yaml_producer_t self, 461 yaml_code_t *code, 462 yaml_char_t *buff, /* at least 1K buffer */ 463 size_t buffsize 464 ); /* returns number of bytes used in the buffer */ 465 466 /* canonical helper to show how to hook up a parser (as a push 467 * producer) to an emitter (as a push consumer) */ 468 #define YAML_PULL2PUSH(pull, producer, push, consumer) \ 469 do { \ 470 yaml_code_t code = YAML_NOTICE; \ 471 yaml_more_t more = YAML_CONTINUE; \ 472 yaml_char_t buff[1024]; \ 473 size_t size = 0; \ 474 memset(buff, 0, 1024 * sizeof(yaml_char_t)); \ 475 while( code && more) { \ 476 size = (pull)((producer),&code, buff, 1024); \ 477 assert(size < 1024 && !buff[size]); \ 478 more = (push)((consumer),code, buff, size); \ 479 } \ 480 buff[0] = 0; \ 481 (push)((consumer),YAML_FINISH, buff, 0); \ 482 } while(1) 483 484 #endif 485