1#
2# Reflects Oren's comments, adds yamlbyte.h at the bottom
3#
4subject: Revision #4 of YAML Bytecodes
5summary: >
6    This proposal defines a 'preparsed' format where a YAML syntax
7    is converted into a series of events, as bytecodes.   Each bytecode
8    appears on its own line, starting with a single character and ending
9    with a line feed character, '\n'.
10codes:
11  #
12  # Primary Bytecodes  (Capital Letters)
13  #
14  # These bytecodes form the minimum needed to represent YAML information
15  # from the serial model (ie, without format and comments)
16  #
17    'D':
18        name: Document
19        desc: >
20          Indicates that a document has begun, either it is
21          the beginning of a YAML stream, or a --- has been
22          found.   Thus, an empty document is expressed
23          as "D\n"
24    'V':
25        name: Directive
26        desc: >
27          This represents any YAML directives immediately following
28          a 'D' bytecode.  For example '--- %YAML:1.0' produces the
29          bytecode "D\nVYAML:1.0\n".
30    'P':
31        name: Pause Stream
32        desc: >
33          This is the instruction when a document is terminated, but
34          another document has not yet begun.  Thus, it is optional,
35          and typically used to pause parsing.  For example,
36          a stream starting with an empty document, but then in a
37          hold state for the next document would be: "D\nP\n"
38    '\z':
39        name: Finish (end stream)
40        desc: >
41          YAML bytecodes are meant to be passable as a single "C"
42          string, and thus the null terminator can optionally be
43          used to signal the end of a stream.  When writing bytecodes
44          out to a flat file, the file need not contain a null
45          terminator; however, when read into memory it should
46          always have a null terminator.
47    'M':
48        name: Mapping
49        desc: >
50          Indicates the begin of a mapping, children of the
51          mapping are provided as a series of K1,V1,K2,V2
52          pairs as they are found in the input stream.  For
53          example, the bytecodes for "{ a: b, c: d }" would
54          be "M\nSa\nSb\nSc\nSd\nE\n"
55    'Q':
56        name: Sequence
57        desc: >
58          Indicates the begin of a sequence, children are provided
59          following till a '.' bytecode is encountered.  So, the
60          bytecodes for "[ one, two ]" would be "Q\nSone\nStwo\nE\n"
61    'E':
62        name: End Collection
63        desc: >
64          This closes the outermost Collection (Mapping, Sequence),
65          note that the document has one and only one node following
66          it, therefore it is not a branch.
67    'S':
68        name: Scalar
69        desc: >
70          This indicates the start of a scalar value, which can
71          be continued by the 'N' and 'C' bytecodes.   This bytecode
72          is used for sequence entries, keys, values, etc.
73    'C':
74        name: Scalar Continuation
75        desc: >
76          Since a scalar may not fit within a buffer, and since it
77          may not contain a \n character, it may have to be broken
78          into several chunks.
79    'N':
80        name: Normalized New Line (in a scalar value)
81        desc: >
82          Scalar values must be chunked so that new lines and
83          null values do not occur within a 'S' or 'C' bytecode
84          (in the bytecodes, all other C0 need not be escaped).
85          This bytecode is then used to represent one or more
86          newlines, with the number of newlines optionally
87          following.   For example,
88          "Hello\nWorld" would be "SHello\nN\nCWorld\n", and
89          "Hello\n\n\nWorld" is "SHello\nN3\nCWorld\n"
90
91          If the new line is an LS or a PS, the N bytecode can
92          be followed with a L or P.   Thus, "Hello\PWorld\L" is
93          reported "SHello\nNP\nWorld\NL\n"
94
95    'Z':
96        name: Null Character (in a scalar value)
97        desc: >
98          As in normalized new lines above, since the null character
99          cannot be used in the bytecodes, is must be escaped, ie,
100          "Hello\zWorld" would be "SHello\nZ\nCWorld\n".
101    'A':
102        name: Alias
103        desc: >
104          This is used when ever there is an alias node, for
105          example, "[ &X one, *X ]" would be normalized
106          to "S\nAX\nSone\nRX\nE\n" -- in this example, the
107          anchor bytecode applies to the very next content
108          bytecode.
109    'R':
110        name: Reference (Anchor)
111        desc: >
112          This bytecode associates an anchor with the very next
113          content node, see the 'A' alias bytecode.
114    'T':
115        name: Transfer
116        desc: >
117          This is the transfer method.  If the value begins with
118          a '!', then it is not normalized.  Otherwise, the value
119          is a fully qualified URL, with a semicolon.  The transfer
120          method applies only to the node immediately following,
121          and thus it can be seen as a modifier like the anchor.
122          For example, "Ttag:yaml.org,2002:str\nSstring\n" is
123          normalized, "T!str\nSstring\n" is not.
124  #
125  # Formatting bytecodes (lower case)
126  #
127  # The following bytecodes are purely at the syntax level and
128  # useful for pretty printers and emitters.  Since the range of
129  # lower case letters is contiguous, it could be easy for a
130  # processor to simply ignore all bytecodes in this range.
131  #
132    'c':
133        name: Comment
134        desc: >
135          This is a single line comment.  It is terminated like all
136          of the other variable length items, with a '\n'.
137    'i':
138        name: Indent
139        desc: >
140          Specifies number of additional spaces to indent for
141          subsequent block style nodes, "i4\n" specifies 4 char indent.
142    's':
143        name: Scalar styling
144        desc: >
145          This bytecode, is followed with one of the following
146          items to indicate the style to be used for the very
147          next content node.  It is an error to specify a style for
148          a scalar other than double quoted when it must be escaped.
149          Furthermore, there must be agreement between the style
150          and the very next content node, in other words, a scalar
151          style requires that the next content node be an S.
152
153          > flow scalar
154          " double quoted scalar
155          ' single quoted scalar
156          | literal scalar
157          p plain scalar
158          { inline mapping
159          [ inline sequence
160          b block style (for mappings and sequences'")
161
162   #
163   # Advanced bytecodes (not alphabetic)
164   #
165   # These are optional goodies which one could find useful.
166   #
167    '#':
168        name: Line Number
169        desc: >
170          This bytecode allows the line number of the very next
171          node to be reported.
172    '!':
173        name: Notice
174        desc: >
175          This is a message sent from the producer to the consumer
176          regarding the state of the stream or document.  It does
177          not necessarly end a stream, as the 'finish' bytecode can
178          be used for this purpose.  This signal has a packed format,
179          with the error number, a comma, and a textual message:
180              "#22\n!73,Indentation mismatch\n"
181              "#132\n!84,Tabs are illegal for indentation\n"
182    ',':
183        name: Span
184        desc: >
185          This bytecode gives the span of the very next 'S', 'M',
186          or 'Q' bytecode -- including its subordinates.  For scalars,
187          it includes the span of all subordinate 'N' and 'C' codes.
188          For mappings or sequences, this gives the length all the
189          way to the corresponding 'E' bytecode so that the entire
190          branch can be skipped.   The length is given starting at
191          the corresponding 'S', 'M' or 'Q' bytecode and extends
192          to the first character following subordinate nodes.
193
194          Since this length instruction is meant to be used to 'speed'
195          things up, and since calculating the length via hand is not
196          really ideal, the length is expressed in Hex.  This will allow
197          programs to easily convert the length to an actual value
198          (converting from hex to integers is easier than decimal).
199          Furthermore, all leading x's are ignored (so that they can
200          be filled in later) and if the bytecode value is all x's,
201          then the length is unknown.  Lastly, this length is expressed
202          in 8 bit units for UTF-8, and 16 bit units for UTF-16.
203
204          For example,
205             --- [[one, two], three]
206          Is expressed as,
207             "?25\nD\n?x1E\nQ\n?xxE\nQ\nSone\nStwo\nE\nSthree\nE\n"
208
209          Thus it is seen that the address of D plus 37 is the null
210          terminator for the string, the first 'Q' plus 30 also
211          gives the null teriminator, and the second 'Q' plus
212          14 jumps to the opening 'S' for the third scalar.
213    '@':
214        name: Allocate
215        desc: >
216          This is a hint telling the processor how many items
217          are in the following collection (mapping pairs, or
218          sequence values), or how many character units need
219          to be allocated to hold the next value.  Clearly this
220          is encoding specific value.   The length which
221          follows is in hex (not decimal).
222
223          For example, "one", could be  "@x3\nSone"
224
225design:
226  -
227    name: streaming support
228    problem: >
229      The interface should ideally allow for a YAML document to be
230      moved incrementally as a stream through a process.   In particular,
231      YAML is inheritently line oriented, thus the interface should
232      probably reflect this fundamental character.
233    solution: >
234      The bytecodes deliver scalars as chunks, each chunk limited to
235      at most one line.   While this is not ideal for passing large
236      binary objects, it is simple and easy to understand.
237  -
238    name: push
239    problem: >
240      The most common 'parsers' out there for YAML are push style, where
241      the producer owns the 'C' program stack, and the consumer keeps
242      its state as a heap object.  Ideal use of a push interface is an
243      emitter, since this allows the sender (the application program)
244      to use the program stack and thus keep its state on the call stack
245      in local, automatic variables.
246    solution: >
247      A push interface simply can call a single event handler with a
248      (bytecode, payload) tuple.  Since the core complexity is in the
249      bytecodes, the actual function signature is straight-forward
250      allowing for relative language independence.  Since the bytecode
251      is always one character, the event handler could just receive
252      a string where the tuple is implicit.
253  -
254    name: pull
255    problem: >
256      The other alternative for a streaming interface is a 'pull' mechanism,
257      or iterator model where the consumer owns the C stack and the producer
258      keeps any state needed as a heap object.  Ideal use of a pull
259      interface is a parser, since this allows the receiver (the application
260      program) to use the program stack, keeping its state on the call stack
261      in local variables.
262    solution: >
263      A pull interface would also be a simple function, that when called
264      filles a buffer with binary node(s).   Or, in a language with
265      garbage collection, could be implemented as an iterator returning
266      a string containing the bytecode line (bytecode followed immediately
267      by the bytecode argument as a single string) or as a tuple.
268  -
269    name: pull2push
270    problem: >
271      This is done easily via a small loop which pulls from the
272      iterator and pushes to the event handler.
273    solution: >
274      For python, assuming the parser is implemented as an iterator
275      where one can 'pull' bytecode, args tuples, and assuming the
276      emitter has a event callback taking a bytecode, args tuple,
277      we have:
278
279        def push2pull(parser, emitter):
280           for (bytecode, args) in parser:
281               emitter.push(bytecode, args)
282
283  -
284    name: push2pull
285    problem: >
286      This requires the entire YAML stream be cashed in memory, or
287      each of the two stages in a thread or different continuation
288      with shared memory or pipe between them.
289    solution: >
290      This use case seems much easier with a binary stream; that is,
291      one need not convert the style of functions between the push
292      vs pull pattern.   And, for languages supporting continuations,
293      (ruby) perhaps push vs pull is not even an issue...   for a
294      language like python, one would use the threaded Queue object,
295      one thread pushes (bytecode, args) tuples into the Queue, while
296      the other thread pulls the tuples out.  Simple.
297  -
298    name: neutrality
299    problem: >
300      It would be ideal of the C Program interface was simple enough
301      to be independent of programming language.   In an ideal case,
302      imagine a flow of YAML structured data through various processing
303      stages on a server; where each processing stage is written in
304      a different programming language.
305    solution: >
306      While it may be hard for each language to write a syntax parser
307      filled with all of the little details, it would be much much
308      easier to write a parser for these bytecodes; as it involves
309      simple string handling, dispatching on the first character in
310      each string.
311  -
312    name: tools
313    problem: >
314      A goal of mine is to have a YPATH expression language, a schema
315      language, and a transformation language.   I would like these items
316      to be reusable by a great number of platforms/languages, and in
317      particular as its own callable processing stage.
318    solution: >
319      If such an expression language was written on top of a bytecode
320      format like this, via a simple pull function (/w adapters for
321      push2pull and pull2push) quite a bit of reusability could emerge.
322      Imagine a schema validator which is injected into the bytecode stream
323      and it is an identity operation unless an exception occurs, in
324      which case, it terminates the document and makes the next document
325      be a description of the validation error.
326  -
327    name: encoding
328    problem: >
329      Text within the bytecode format must be given an encoding.  There are
330      several considerations at hand listed below.
331    solution: >
332      The YAML bytecode format uses the same encodings as YAML itself,
333      and thus is independent of actual encoding.  A parser library should
334      have several functions to convert between the encodings.
335examples:
336  -
337    yaml: |
338      ---
339      - plain
340      - >
341        this is a flow scalar
342      - >
343        another flow scalar which is continued
344        on a second line and indented 2 spaces
345      - &001 !str |
346        This is a block scalar, both typed
347        and anchored
348      - *001 # this was an alias
349      - "This is a \"double quoted\" scalar"
350    bytecode: |
351      D
352      Q
353      Splain
354      f
355      Sthis is a flow scalar
356      Sanother flow scalar which is continued
357      Con a second line and indented 2 spaces
358      b
359      a001
360      t!str
361      SThis is a block scalar, both typed
362      N
363      Cand anchored
364      R001
365      cthis was an alias
366      d
367      SThis is a "double quoted" scalar
368      E
369cheader: |
370    /*  yamlbyte.h
371     *
372     *  The YAML bytecode "C" interface header file.   See the YAML bytecode
373     *  reference for bytecode sequence rules and for the meaning of each
374     *  bytecode.
375     */
376
377    #ifndef YAMLBYTE_H
378    #define YAMLBYTE_H
379    #include <stddef.h>
380    /* list out the various YAML bytecodes */
381    typedef enum {
382        /* content bytecodes */
383        YAML_FINISH    = 0,
384        YAML_DOCUMENT  = 'D',
385        YAML_DIRECTIVE = 'V',
386        YAML_PAUSE     = 'P',
387        YAML_MAPPING   = 'M',
388        YAML_SEQUENCE  = 'S',
389        YAML_ENDMAPSEQ = 'E',
390        YAML_SCALAR    = 'S',
391        YAML_CONTINUE  = 'C',
392        YAML_NEWLINE   = 'N',
393        YAML_NULLCHAR  = 'Z',
394        YAML_ALIAS     = 'A',
395        YAML_ANCHOR    = 'R',
396        YAML_TRANSFER  = 'T',
397        /* formatting bytecodes */
398        YAML_COMMENT = 'c',
399        YAML_INDENT  = 'i',
400        YAML_STYLE   = 's',
401        /* other bytecodes */
402        YAML_LINENUMBER = '#',
403        YAML_NOTICE = '!',
404        YAML_SPAN   = ',',
405        YAML_ALLOC  = '@'
406    } yaml_code_t;
407
408    /* additional modifiers for the YAML_STYLE bytecode */
409    typedef enum {
410       YAML_FLOW = '>',
411       YAML_LITERAL = '|',
412       YAML_BLOCK = 'b',
413       YAML_PLAIN = 'p',
414       YAML_INLINE_MAPPING = '{',
415       YAML_INLINE_SEQUENCE = '}',
416       YAML_SINGLE_QUOTED = 39,
417       YAML_DOUBLE_QUOTED = '"'
418    } yaml_style_t;
419
420    typedef unsigned char yaml_utf8_t;
421    typedef unsigned short yaml_utf16_t;
422    #ifdef YAML_UTF8
423      #ifdef YAML_UTF16
424        #error Must only define YAML_UTF8 or YAML_UTF16
425      #endif
426      typedef yaml_utf8_t yaml_char_t;
427    #else
428      #ifdef YAML_UTF16
429        typedef yaml_utf16_t yaml_char_t;
430      #else
431        #error Must define YAML_UTF8 or YAML_UTF16
432      #endif
433    #endif
434
435    /* return value for push function, tell parser if you want to stop */
436    typedef enum {
437        YAML_MORE = 1, /* producer should continue to fire events */
438        YAML_STOP = 0  /* producer should stop firing events      */
439    } yaml_more_t;
440
441    /* push bytecodes from a producer to a consumer
442     * where arg is null terminated /w a length */
443    typedef void * yaml_consumer_t;
444    typedef
445      yaml_more_t
446       (*yaml_push_t)(
447         yaml_consumer_t self,
448         yaml_code_t code,
449         const yaml_char_t *arg,
450         size_t arglen
451       );
452
453    /* pull bytecodes by the producer from the consumer, where
454     * producer must null terminate buff and return the number
455     * of sizeof(yaml_char_t) bytes used */
456    typedef void * yaml_producer_t;
457    typedef
458      size_t
459        (*yaml_pull_t)(
460          yaml_producer_t self,
461          yaml_code_t *code,
462          yaml_char_t *buff,     /* at least 1K buffer */
463          size_t buffsize
464        );  /* returns number of bytes used in the buffer */
465
466    /* canonical helper to show how to hook up a parser (as a push
467     * producer) to an emitter (as a push consumer)  */
468    #define YAML_PULL2PUSH(pull, producer, push, consumer)      \
469      do {                                                      \
470          yaml_code_t code = YAML_NOTICE;                       \
471          yaml_more_t more = YAML_CONTINUE;                     \
472          yaml_char_t buff[1024];                               \
473          size_t      size = 0;                                 \
474          memset(buff, 0, 1024 * sizeof(yaml_char_t));          \
475          while( code && more) {                                \
476              size = (pull)((producer),&code, buff, 1024);      \
477              assert(size < 1024 && !buff[size]);               \
478              more = (push)((consumer),code, buff, size);       \
479          }                                                     \
480          buff[0] = 0;                                          \
481          (push)((consumer),YAML_FINISH, buff, 0);              \
482      } while(1)
483
484    #endif
485