1 /*
2 ** protobuf decoder bytecode compiler
3 **
4 ** Code to compile a upb::Handlers into bytecode for decoding a protobuf
5 ** according to that specific schema and destination handlers.
6 **
7 ** Bytecode definition is in decoder.int.h.
8 */
9 
10 #include <stdarg.h>
11 #include "upb/pb/decoder.int.h"
12 #include "upb/pb/varint.int.h"
13 
14 #ifdef UPB_DUMP_BYTECODE
15 #include <stdio.h>
16 #endif
17 
18 #include "upb/port_def.inc"
19 
20 #define MAXLABEL 5
21 #define EMPTYLABEL -1
22 
23 /* upb_pbdecodermethod ********************************************************/
24 
/* Destroys a decoder method: uninitializes its dispatch table and then
 * releases the method object itself. */
static void freemethod(upb_pbdecodermethod *method) {
  upb_inttable *dispatch = &method->dispatch;

  upb_inttable_uninit(dispatch);
  upb_gfree(method);
}
29 
newmethod(const upb_handlers * dest_handlers,mgroup * group)30 static upb_pbdecodermethod *newmethod(const upb_handlers *dest_handlers,
31                                       mgroup *group) {
32   upb_pbdecodermethod *ret = upb_gmalloc(sizeof(*ret));
33   upb_byteshandler_init(&ret->input_handler_);
34 
35   ret->group = group;
36   ret->dest_handlers_ = dest_handlers;
37   upb_inttable_init(&ret->dispatch, UPB_CTYPE_UINT64);
38 
39   return ret;
40 }
41 
upb_pbdecodermethod_desthandlers(const upb_pbdecodermethod * m)42 const upb_handlers *upb_pbdecodermethod_desthandlers(
43     const upb_pbdecodermethod *m) {
44   return m->dest_handlers_;
45 }
46 
upb_pbdecodermethod_inputhandler(const upb_pbdecodermethod * m)47 const upb_byteshandler *upb_pbdecodermethod_inputhandler(
48     const upb_pbdecodermethod *m) {
49   return &m->input_handler_;
50 }
51 
upb_pbdecodermethod_isnative(const upb_pbdecodermethod * m)52 bool upb_pbdecodermethod_isnative(const upb_pbdecodermethod *m) {
53   return m->is_native_;
54 }
55 
56 
57 /* mgroup *********************************************************************/
58 
/* Destroys a method group: frees every method stored in the group's method
 * table, then the table, the bytecode buffer, and the group itself. */
static void freegroup(mgroup *g) {
  upb_inttable_iter it;

  upb_inttable_begin(&it, &g->methods);
  while (!upb_inttable_done(&it)) {
    upb_pbdecodermethod *m = upb_value_getptr(upb_inttable_iter_value(&it));
    freemethod(m);
    upb_inttable_next(&it);
  }

  upb_inttable_uninit(&g->methods);
  upb_gfree(g->bytecode);
  upb_gfree(g);
}
71 
newgroup(void)72 mgroup *newgroup(void) {
73   mgroup *g = upb_gmalloc(sizeof(*g));
74   upb_inttable_init(&g->methods, UPB_CTYPE_PTR);
75   g->bytecode = NULL;
76   g->bytecode_end = NULL;
77   return g;
78 }
79 
80 
81 /* bytecode compiler **********************************************************/
82 
/* Data used only at compilation time.  Created by newcompiler() and released
 * with freecompiler() once the group's bytecode has been generated. */
typedef struct {
  /* Group being compiled; its bytecode buffer is what put32() appends to. */
  mgroup *group;

  /* Write cursor: the word in group->bytecode that put32() stores to next. */
  uint32_t *pc;
  /* Per label: word offset (from the start of the bytecode) of the most
   * recently emitted instruction that references the label but is still
   * unresolved, or EMPTYLABEL.  Earlier references are chained through the
   * instructions' own offset fields (see labelref() and label()). */
  int fwd_labels[MAXLABEL];
  /* Per label: word offset of the label's most recent definition, or
   * EMPTYLABEL if it has not been defined yet. */
  int back_labels[MAXLABEL];

  /* For fields marked "lazy", parse them lazily or eagerly? */
  bool lazy;
} compiler;
94 
newcompiler(mgroup * group,bool lazy)95 static compiler *newcompiler(mgroup *group, bool lazy) {
96   compiler *ret = upb_gmalloc(sizeof(*ret));
97   int i;
98 
99   ret->group = group;
100   ret->lazy = lazy;
101   for (i = 0; i < MAXLABEL; i++) {
102     ret->fwd_labels[i] = EMPTYLABEL;
103     ret->back_labels[i] = EMPTYLABEL;
104   }
105   return ret;
106 }
107 
/* Releases a compiler object.  The group it referred to is not touched. */
static void freecompiler(compiler *c) {
  compiler *doomed = c;

  upb_gfree(doomed);
}
111 
/* Number of 32-bit bytecode words occupied by one pointer operand. */
const size_t ptr_words = sizeof(void*) / sizeof(uint32_t);
113 
114 /* How many words an instruction is. */
instruction_len(uint32_t instr)115 static int instruction_len(uint32_t instr) {
116   switch (getop(instr)) {
117     case OP_SETDISPATCH: return 1 + ptr_words;
118     case OP_TAGN: return 3;
119     case OP_SETBIGGROUPNUM: return 2;
120     default: return 1;
121   }
122 }
123 
op_has_longofs(int32_t instruction)124 bool op_has_longofs(int32_t instruction) {
125   switch (getop(instruction)) {
126     case OP_CALL:
127     case OP_BRANCH:
128     case OP_CHECKDELIM:
129       return true;
130     /* The "tag" instructions only have 8 bytes available for the jump target,
131      * but that is ok because these opcodes only require short jumps. */
132     case OP_TAG1:
133     case OP_TAG2:
134     case OP_TAGN:
135       return false;
136     default:
137       UPB_ASSERT(false);
138       return false;
139   }
140 }
141 
/* Extracts the signed jump offset encoded in `instruction`: a sign-extended
 * 8-bit field for short-offset opcodes, otherwise all bits above the opcode
 * byte. */
static int32_t getofs(uint32_t instruction) {
  if (!op_has_longofs(instruction)) {
    return (int8_t)(instruction >> 8);
  }
  return (int32_t)instruction >> 8;
}
149 
/* Stores jump offset `ofs` into *instruction, using the long or short encoding
 * as dictated by the opcode already present in the word.  The long form
 * rewrites everything above the opcode byte; the short form replaces only
 * bits 8-15 and leaves the rest of the word intact. */
static void setofs(uint32_t *instruction, int32_t ofs) {
  uint32_t word = *instruction;

  if (op_has_longofs(word)) {
    word = getop(word) | (uint32_t)ofs << 8;
  } else {
    word = (word & ~0xff00) | ((ofs & 0xff) << 8);
  }
  *instruction = word;

  /* Reading the offset back catches values that did not fit the field. */
  UPB_ASSERT(getofs(*instruction) == ofs);
}
158 
/* Returns the compiler's current write position as a word offset from the
 * start of the group's bytecode buffer. */
static uint32_t pcofs(compiler *c) {
  const uint32_t *base = c->group->bytecode;

  return (uint32_t)(c->pc - base);
}
162 
163 /* Defines a local label at the current PC location.  All previous forward
164  * references are updated to point to this location.  The location is noted
165  * for any future backward references. */
label(compiler * c,unsigned int label)166 static void label(compiler *c, unsigned int label) {
167   int val;
168   uint32_t *codep;
169 
170   UPB_ASSERT(label < MAXLABEL);
171   val = c->fwd_labels[label];
172   codep = (val == EMPTYLABEL) ? NULL : c->group->bytecode + val;
173   while (codep) {
174     int ofs = getofs(*codep);
175     setofs(codep, (int32_t)(c->pc - codep - instruction_len(*codep)));
176     codep = ofs ? codep + ofs : NULL;
177   }
178   c->fwd_labels[label] = EMPTYLABEL;
179   c->back_labels[label] = pcofs(c);
180 }
181 
182 /* Creates a reference to a numbered label; either a forward reference
183  * (positive arg) or backward reference (negative arg).  For forward references
184  * the value returned now is actually a "next" pointer into a linked list of all
185  * instructions that use this label and will be patched later when the label is
186  * defined with label().
187  *
188  * The returned value is the offset that should be written into the instruction.
189  */
labelref(compiler * c,int label)190 static int32_t labelref(compiler *c, int label) {
191   UPB_ASSERT(label < MAXLABEL);
192   if (label == LABEL_DISPATCH) {
193     /* No resolving required. */
194     return 0;
195   } else if (label < 0) {
196     /* Backward local label.  Relative to the next instruction. */
197     uint32_t from = (uint32_t)((c->pc + 1) - c->group->bytecode);
198     return c->back_labels[-label] - from;
199   } else {
200     /* Forward local label: prepend to (possibly-empty) linked list. */
201     int *lptr = &c->fwd_labels[label];
202     int32_t ret = (*lptr == EMPTYLABEL) ? 0 : *lptr - pcofs(c);
203     *lptr = pcofs(c);
204     return ret;
205   }
206 }
207 
/* Appends one 32-bit word to the group's bytecode and advances the PC.  When
 * the buffer is full (or has not been allocated yet) it is grown first:
 * capacity doubles, with a floor of 64 words. */
static void put32(compiler *c, uint32_t v) {
  mgroup *g = c->group;

  if (c->pc == g->bytecode_end) {
    /* Remember the cursor as an offset; the buffer may move when it grows. */
    int cursor = pcofs(c);
    size_t have = g->bytecode_end - g->bytecode;
    size_t want = UPB_MAX(have * 2, 64);

    /* TODO(haberman): handle OOM. */
    g->bytecode = upb_grealloc(g->bytecode, have * sizeof(uint32_t),
                                            want * sizeof(uint32_t));
    c->pc = g->bytecode + cursor;
    g->bytecode_end = g->bytecode + want;
  }

  *c->pc = v;
  c->pc++;
}
222 
/* Emits one instruction.  The variadic arguments depend on the opcode:
 *
 *   OP_SETDISPATCH                     void* (dispatch table address)
 *   OP_STARTMSG .. OP_DISPATCH group   none are read
 *   OP_PARSE_*, seq/submsg/str ops,
 *   OP_PUSHTAGDELIM                    upb_selector_t, stored above the opcode
 *   OP_SETBIGGROUPNUM                  int, emitted as a second word
 *   OP_CALL                            upb_pbdecodermethod* (call target)
 *   OP_CHECKDELIM, OP_BRANCH           int label (see labelref())
 *   OP_TAG1, OP_TAG2                   int label, uint64_t tag (<= 0xffff)
 *   OP_TAGN                            int label, uint64_t tag
 *
 * An opcode not listed here emits nothing. */
static void putop(compiler *c, int op, ...) {
  va_list args;
  va_start(args, op);

  switch (op) {
    case OP_SETDISPATCH: {
      /* Opcode word followed by the raw pointer, low 32 bits first. */
      uintptr_t table = (uintptr_t)va_arg(args, void*);
      put32(c, OP_SETDISPATCH);
      put32(c, (uint32_t)table);
      if (sizeof(uintptr_t) > sizeof(uint32_t)) {
        put32(c, (uint64_t)table >> 32);
      }
      break;
    }

    /* Bare opcodes: no operand. */
    case OP_STARTMSG:
    case OP_ENDMSG:
    case OP_PUSHLENDELIM:
    case OP_POP:
    case OP_SETDELIM:
    case OP_HALT:
    case OP_RET:
    case OP_DISPATCH:
      put32(c, op);
      break;

    /* Opcodes carrying a selector above the opcode byte. */
    case OP_PARSE_DOUBLE:
    case OP_PARSE_FLOAT:
    case OP_PARSE_INT64:
    case OP_PARSE_UINT64:
    case OP_PARSE_INT32:
    case OP_PARSE_FIXED64:
    case OP_PARSE_FIXED32:
    case OP_PARSE_BOOL:
    case OP_PARSE_UINT32:
    case OP_PARSE_SFIXED32:
    case OP_PARSE_SFIXED64:
    case OP_PARSE_SINT32:
    case OP_PARSE_SINT64:
    case OP_STARTSEQ:
    case OP_ENDSEQ:
    case OP_STARTSUBMSG:
    case OP_ENDSUBMSG:
    case OP_STARTSTR:
    case OP_STRING:
    case OP_ENDSTR:
    case OP_PUSHTAGDELIM:
      put32(c, op | va_arg(args, upb_selector_t) << 8);
      break;

    case OP_SETBIGGROUPNUM:
      put32(c, op);
      put32(c, va_arg(args, int));
      break;

    case OP_CALL: {
      /* Target is encoded relative to the word after this instruction. */
      const upb_pbdecodermethod *callee = va_arg(args, upb_pbdecodermethod *);
      put32(c, op | (callee->code_base.ofs - (pcofs(c) + 1)) << 8);
      break;
    }

    case OP_CHECKDELIM:
    case OP_BRANCH: {
      uint32_t word = op;
      int target = va_arg(args, int);
      setofs(&word, labelref(c, target));
      put32(c, word);
      break;
    }

    case OP_TAG1:
    case OP_TAG2: {
      /* The expected tag bytes live in the upper 16 bits of the word. */
      int target = va_arg(args, int);
      uint64_t tag = va_arg(args, uint64_t);
      uint32_t word = (uint32_t)(op | (tag << 16));
      UPB_ASSERT(tag <= 0xffff);
      setofs(&word, labelref(c, target));
      put32(c, word);
      break;
    }

    case OP_TAGN: {
      /* The tag's byte count goes in the instruction word; the tag itself
       * follows as two words, low half first. */
      int target = va_arg(args, int);
      uint64_t tag = va_arg(args, uint64_t);
      uint32_t word = op | (upb_value_size(tag) << 16);
      setofs(&word, labelref(c, target));
      put32(c, word);
      put32(c, (uint32_t)tag);
      put32(c, tag >> 32);
      break;
    }
  }

  va_end(args);
}
310 
311 #if defined(UPB_DUMP_BYTECODE)
312 
upb_pbdecoder_getopname(unsigned int op)313 const char *upb_pbdecoder_getopname(unsigned int op) {
314 #define QUOTE(x) #x
315 #define EXPAND_AND_QUOTE(x) QUOTE(x)
316 #define OPNAME(x) OP_##x
317 #define OP(x) case OPNAME(x): return EXPAND_AND_QUOTE(OPNAME(x));
318 #define T(x) OP(PARSE_##x)
319   /* Keep in sync with list in decoder.int.h. */
320   switch ((opcode)op) {
321     T(DOUBLE) T(FLOAT) T(INT64) T(UINT64) T(INT32) T(FIXED64) T(FIXED32)
322     T(BOOL) T(UINT32) T(SFIXED32) T(SFIXED64) T(SINT32) T(SINT64)
323     OP(STARTMSG) OP(ENDMSG) OP(STARTSEQ) OP(ENDSEQ) OP(STARTSUBMSG)
324     OP(ENDSUBMSG) OP(STARTSTR) OP(STRING) OP(ENDSTR) OP(CALL) OP(RET)
325     OP(PUSHLENDELIM) OP(PUSHTAGDELIM) OP(SETDELIM) OP(CHECKDELIM)
326     OP(BRANCH) OP(TAG1) OP(TAG2) OP(TAGN) OP(SETDISPATCH) OP(POP)
327     OP(SETBIGGROUPNUM) OP(DISPATCH) OP(HALT)
328   }
329   return "<unknown op>";
330 #undef OP
331 #undef T
332 }
333 
334 #endif
335 
336 #ifdef UPB_DUMP_BYTECODE
337 
dumpbc(uint32_t * p,uint32_t * end,FILE * f)338 static void dumpbc(uint32_t *p, uint32_t *end, FILE *f) {
339 
340   uint32_t *begin = p;
341 
342   while (p < end) {
343     fprintf(f, "%p  %8tx", p, p - begin);
344     uint32_t instr = *p++;
345     uint8_t op = getop(instr);
346     fprintf(f, " %s", upb_pbdecoder_getopname(op));
347     switch ((opcode)op) {
348       case OP_SETDISPATCH: {
349         const upb_inttable *dispatch;
350         memcpy(&dispatch, p, sizeof(void*));
351         p += ptr_words;
352         const upb_pbdecodermethod *method =
353             (void *)((char *)dispatch -
354                      offsetof(upb_pbdecodermethod, dispatch));
355         fprintf(f, " %s", upb_msgdef_fullname(
356                               upb_handlers_msgdef(method->dest_handlers_)));
357         break;
358       }
359       case OP_DISPATCH:
360       case OP_STARTMSG:
361       case OP_ENDMSG:
362       case OP_PUSHLENDELIM:
363       case OP_POP:
364       case OP_SETDELIM:
365       case OP_HALT:
366       case OP_RET:
367         break;
368       case OP_PARSE_DOUBLE:
369       case OP_PARSE_FLOAT:
370       case OP_PARSE_INT64:
371       case OP_PARSE_UINT64:
372       case OP_PARSE_INT32:
373       case OP_PARSE_FIXED64:
374       case OP_PARSE_FIXED32:
375       case OP_PARSE_BOOL:
376       case OP_PARSE_UINT32:
377       case OP_PARSE_SFIXED32:
378       case OP_PARSE_SFIXED64:
379       case OP_PARSE_SINT32:
380       case OP_PARSE_SINT64:
381       case OP_STARTSEQ:
382       case OP_ENDSEQ:
383       case OP_STARTSUBMSG:
384       case OP_ENDSUBMSG:
385       case OP_STARTSTR:
386       case OP_STRING:
387       case OP_ENDSTR:
388       case OP_PUSHTAGDELIM:
389         fprintf(f, " %d", instr >> 8);
390         break;
391       case OP_SETBIGGROUPNUM:
392         fprintf(f, " %d", *p++);
393         break;
394       case OP_CHECKDELIM:
395       case OP_CALL:
396       case OP_BRANCH:
397         fprintf(f, " =>0x%tx", p + getofs(instr) - begin);
398         break;
399       case OP_TAG1:
400       case OP_TAG2: {
401         fprintf(f, " tag:0x%x", instr >> 16);
402         if (getofs(instr)) {
403           fprintf(f, " =>0x%tx", p + getofs(instr) - begin);
404         }
405         break;
406       }
407       case OP_TAGN: {
408         uint64_t tag = *p++;
409         tag |= (uint64_t)*p++ << 32;
410         fprintf(f, " tag:0x%llx", (long long)tag);
411         fprintf(f, " n:%d", instr >> 16);
412         if (getofs(instr)) {
413           fprintf(f, " =>0x%tx", p + getofs(instr) - begin);
414         }
415         break;
416       }
417     }
418     fputs("\n", f);
419   }
420 }
421 
422 #endif
423 
/* Builds the tag for field `f` with the given wire type (field number shifted
 * left by three, OR'd with the wire type) and returns it as encoded by
 * upb_vencode32(). */
static uint64_t get_encoded_tag(const upb_fielddef *f, int wire_type) {
  uint32_t field_number = upb_fielddef_number(f);
  uint32_t raw_tag = (field_number << 3) | wire_type;
  uint64_t encoded = upb_vencode32(raw_tag);

  /* No tag should be greater than 5 bytes. */
  UPB_ASSERT(encoded <= 0xffffffffff);
  return encoded;
}
431 
/* Emits a tag-check instruction for field `f` with the given wire type; `dest`
 * is the label operand handed to the instruction.  The opcode is chosen from
 * the size upb_value_size() reports for the encoded tag: OP_TAG1 for 1,
 * OP_TAG2 for 2, OP_TAGN for anything else. */
static void putchecktag(compiler *c, const upb_fielddef *f,
                        int wire_type, int dest) {
  uint64_t tag = get_encoded_tag(f, wire_type);
  int nbytes = (int)upb_value_size(tag);
  int op;

  if (nbytes == 1) {
    op = OP_TAG1;
  } else if (nbytes == 2) {
    op = OP_TAG2;
  } else {
    op = OP_TAGN;
  }
  putop(c, op, dest, tag);
}
447 
getsel(const upb_fielddef * f,upb_handlertype_t type)448 static upb_selector_t getsel(const upb_fielddef *f, upb_handlertype_t type) {
449   upb_selector_t selector;
450   bool ok = upb_handlers_getselector(f, type, &selector);
451   UPB_ASSERT(ok);
452   return selector;
453 }
454 
455 /* Takes an existing, primary dispatch table entry and repacks it with a
456  * different alternate wire type.  Called when we are inserting a secondary
457  * dispatch table entry for an alternate wire type. */
repack(uint64_t dispatch,int new_wt2)458 static uint64_t repack(uint64_t dispatch, int new_wt2) {
459   uint64_t ofs;
460   uint8_t wt1;
461   uint8_t old_wt2;
462   upb_pbdecoder_unpackdispatch(dispatch, &ofs, &wt1, &old_wt2);
463   UPB_ASSERT(old_wt2 == NO_WIRE_TYPE);  /* wt2 should not be set yet. */
464   return upb_pbdecoder_packdispatch(ofs, wt1, new_wt2);
465 }
466 
467 /* Marks the current bytecode position as the dispatch target for this message,
468  * field, and wire type. */
dispatchtarget(compiler * c,upb_pbdecodermethod * method,const upb_fielddef * f,int wire_type)469 static void dispatchtarget(compiler *c, upb_pbdecodermethod *method,
470                            const upb_fielddef *f, int wire_type) {
471   /* Offset is relative to msg base. */
472   uint64_t ofs = pcofs(c) - method->code_base.ofs;
473   uint32_t fn = upb_fielddef_number(f);
474   upb_inttable *d = &method->dispatch;
475   upb_value v;
476   if (upb_inttable_remove(d, fn, &v)) {
477     /* TODO: prioritize based on packed setting in .proto file. */
478     uint64_t repacked = repack(upb_value_getuint64(v), wire_type);
479     upb_inttable_insert(d, fn, upb_value_uint64(repacked));
480     upb_inttable_insert(d, fn + UPB_MAX_FIELDNUMBER, upb_value_uint64(ofs));
481   } else {
482     uint64_t val = upb_pbdecoder_packdispatch(ofs, wire_type, NO_WIRE_TYPE);
483     upb_inttable_insert(d, fn, upb_value_uint64(val));
484   }
485 }
486 
/* Emits the "push" instruction that opens a submessage frame for field `f`:
 * OP_PUSHLENDELIM for message-typed fields, otherwise OP_PUSHTAGDELIM carrying
 * the field number.  Field numbers that do not fit in the 24-bit operand are
 * emitted as OP_PUSHTAGDELIM 0 followed by OP_SETBIGGROUPNUM. */
static void putpush(compiler *c, const upb_fielddef *f) {
  uint32_t fn;

  if (upb_fielddef_descriptortype(f) == UPB_DESCRIPTOR_TYPE_MESSAGE) {
    putop(c, OP_PUSHLENDELIM);
    return;
  }

  fn = upb_fielddef_number(f);
  if (fn < 1 << 24) {
    putop(c, OP_PUSHTAGDELIM, fn);
    return;
  }

  putop(c, OP_PUSHTAGDELIM, 0);
  putop(c, OP_SETBIGGROUPNUM, fn);
}
500 
find_submethod(const compiler * c,const upb_pbdecodermethod * method,const upb_fielddef * f)501 static upb_pbdecodermethod *find_submethod(const compiler *c,
502                                            const upb_pbdecodermethod *method,
503                                            const upb_fielddef *f) {
504   const upb_handlers *sub =
505       upb_handlers_getsubhandlers(method->dest_handlers_, f);
506   upb_value v;
507   return upb_inttable_lookupptr(&c->group->methods, sub, &v)
508              ? upb_value_getptr(v)
509              : NULL;
510 }
511 
/* Emits `op` with selector `sel`, but only when handlers `h` actually have a
 * handler registered for that selector; otherwise emits nothing. */
static void putsel(compiler *c, opcode op, upb_selector_t sel,
                   const upb_handlers *h) {
  if (!upb_handlers_gethandler(h, sel, NULL)) {
    return;
  }
  putop(c, op, sel);
}
518 
519 /* Puts an opcode to call a callback, but only if a callback actually exists for
520  * this field and handler type. */
maybeput(compiler * c,opcode op,const upb_handlers * h,const upb_fielddef * f,upb_handlertype_t type)521 static void maybeput(compiler *c, opcode op, const upb_handlers *h,
522                      const upb_fielddef *f, upb_handlertype_t type) {
523   putsel(c, op, getsel(f, type), h);
524 }
525 
haslazyhandlers(const upb_handlers * h,const upb_fielddef * f)526 static bool haslazyhandlers(const upb_handlers *h, const upb_fielddef *f) {
527   if (!upb_fielddef_lazy(f))
528     return false;
529 
530   return upb_handlers_gethandler(h, getsel(f, UPB_HANDLER_STARTSTR), NULL) ||
531          upb_handlers_gethandler(h, getsel(f, UPB_HANDLER_STRING), NULL) ||
532          upb_handlers_gethandler(h, getsel(f, UPB_HANDLER_ENDSTR), NULL);
533 }
534 
535 
536 /* bytecode compiler code generation ******************************************/
537 
538 /* Symbolic names for our local labels. */
539 #define LABEL_LOOPSTART 1  /* Top of a repeated field loop. */
540 #define LABEL_LOOPBREAK 2  /* To jump out of a repeated loop */
541 #define LABEL_FIELD     3  /* Jump backward to find the most recent field. */
542 #define LABEL_ENDMSG    4  /* To reach the OP_ENDMSG instr for this msg. */
543 
544 /* Generates bytecode to parse a single non-lazy message field. */
generate_msgfield(compiler * c,const upb_fielddef * f,upb_pbdecodermethod * method)545 static void generate_msgfield(compiler *c, const upb_fielddef *f,
546                               upb_pbdecodermethod *method) {
547   const upb_handlers *h = upb_pbdecodermethod_desthandlers(method);
548   const upb_pbdecodermethod *sub_m = find_submethod(c, method, f);
549   int wire_type;
550 
551   if (!sub_m) {
552     /* Don't emit any code for this field at all; it will be parsed as an
553      * unknown field.
554      *
555      * TODO(haberman): we should change this to parse it as a string field
556      * instead.  It will probably be faster, but more importantly, once we
557      * start vending unknown fields, a field shouldn't be treated as unknown
558      * just because it doesn't have subhandlers registered. */
559     return;
560   }
561 
562   label(c, LABEL_FIELD);
563 
564   wire_type =
565       (upb_fielddef_descriptortype(f) == UPB_DESCRIPTOR_TYPE_MESSAGE)
566           ? UPB_WIRE_TYPE_DELIMITED
567           : UPB_WIRE_TYPE_START_GROUP;
568 
569   if (upb_fielddef_isseq(f)) {
570     putop(c, OP_CHECKDELIM, LABEL_ENDMSG);
571     putchecktag(c, f, wire_type, LABEL_DISPATCH);
572    dispatchtarget(c, method, f, wire_type);
573     putop(c, OP_PUSHTAGDELIM, 0);
574     putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ));
575    label(c, LABEL_LOOPSTART);
576     putpush(c, f);
577     putop(c, OP_STARTSUBMSG, getsel(f, UPB_HANDLER_STARTSUBMSG));
578     putop(c, OP_CALL, sub_m);
579     putop(c, OP_POP);
580     maybeput(c, OP_ENDSUBMSG, h, f, UPB_HANDLER_ENDSUBMSG);
581     if (wire_type == UPB_WIRE_TYPE_DELIMITED) {
582       putop(c, OP_SETDELIM);
583     }
584     putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK);
585     putchecktag(c, f, wire_type, LABEL_LOOPBREAK);
586     putop(c, OP_BRANCH, -LABEL_LOOPSTART);
587    label(c, LABEL_LOOPBREAK);
588     putop(c, OP_POP);
589     maybeput(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ);
590   } else {
591     putop(c, OP_CHECKDELIM, LABEL_ENDMSG);
592     putchecktag(c, f, wire_type, LABEL_DISPATCH);
593    dispatchtarget(c, method, f, wire_type);
594     putpush(c, f);
595     putop(c, OP_STARTSUBMSG, getsel(f, UPB_HANDLER_STARTSUBMSG));
596     putop(c, OP_CALL, sub_m);
597     putop(c, OP_POP);
598     maybeput(c, OP_ENDSUBMSG, h, f, UPB_HANDLER_ENDSUBMSG);
599     if (wire_type == UPB_WIRE_TYPE_DELIMITED) {
600       putop(c, OP_SETDELIM);
601     }
602   }
603 }
604 
605 /* Generates bytecode to parse a single string or lazy submessage field. */
generate_delimfield(compiler * c,const upb_fielddef * f,upb_pbdecodermethod * method)606 static void generate_delimfield(compiler *c, const upb_fielddef *f,
607                                 upb_pbdecodermethod *method) {
608   const upb_handlers *h = upb_pbdecodermethod_desthandlers(method);
609 
610   label(c, LABEL_FIELD);
611   if (upb_fielddef_isseq(f)) {
612     putop(c, OP_CHECKDELIM, LABEL_ENDMSG);
613     putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH);
614    dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED);
615     putop(c, OP_PUSHTAGDELIM, 0);
616     putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ));
617    label(c, LABEL_LOOPSTART);
618     putop(c, OP_PUSHLENDELIM);
619     putop(c, OP_STARTSTR, getsel(f, UPB_HANDLER_STARTSTR));
620     /* Need to emit even if no handler to skip past the string. */
621     putop(c, OP_STRING, getsel(f, UPB_HANDLER_STRING));
622     maybeput(c, OP_ENDSTR, h, f, UPB_HANDLER_ENDSTR);
623     putop(c, OP_POP);
624     putop(c, OP_SETDELIM);
625     putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK);
626     putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_LOOPBREAK);
627     putop(c, OP_BRANCH, -LABEL_LOOPSTART);
628    label(c, LABEL_LOOPBREAK);
629     putop(c, OP_POP);
630     maybeput(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ);
631   } else {
632     putop(c, OP_CHECKDELIM, LABEL_ENDMSG);
633     putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH);
634    dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED);
635     putop(c, OP_PUSHLENDELIM);
636     putop(c, OP_STARTSTR, getsel(f, UPB_HANDLER_STARTSTR));
637     putop(c, OP_STRING, getsel(f, UPB_HANDLER_STRING));
638     maybeput(c, OP_ENDSTR, h, f, UPB_HANDLER_ENDSTR);
639     putop(c, OP_POP);
640     putop(c, OP_SETDELIM);
641   }
642 }
643 
644 /* Generates bytecode to parse a single primitive field. */
generate_primitivefield(compiler * c,const upb_fielddef * f,upb_pbdecodermethod * method)645 static void generate_primitivefield(compiler *c, const upb_fielddef *f,
646                                     upb_pbdecodermethod *method) {
647   const upb_handlers *h = upb_pbdecodermethod_desthandlers(method);
648   upb_descriptortype_t descriptor_type = upb_fielddef_descriptortype(f);
649   opcode parse_type;
650   upb_selector_t sel;
651   int wire_type;
652 
653   label(c, LABEL_FIELD);
654 
655   /* From a decoding perspective, ENUM is the same as INT32. */
656   if (descriptor_type == UPB_DESCRIPTOR_TYPE_ENUM)
657     descriptor_type = UPB_DESCRIPTOR_TYPE_INT32;
658 
659   parse_type = (opcode)descriptor_type;
660 
661   /* TODO(haberman): generate packed or non-packed first depending on "packed"
662    * setting in the fielddef.  This will favor (in speed) whichever was
663    * specified. */
664 
665   UPB_ASSERT((int)parse_type >= 0 && parse_type <= OP_MAX);
666   sel = getsel(f, upb_handlers_getprimitivehandlertype(f));
667   wire_type = upb_pb_native_wire_types[upb_fielddef_descriptortype(f)];
668   if (upb_fielddef_isseq(f)) {
669     putop(c, OP_CHECKDELIM, LABEL_ENDMSG);
670     putchecktag(c, f, UPB_WIRE_TYPE_DELIMITED, LABEL_DISPATCH);
671    dispatchtarget(c, method, f, UPB_WIRE_TYPE_DELIMITED);
672     putop(c, OP_PUSHLENDELIM);
673     putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ));  /* Packed */
674    label(c, LABEL_LOOPSTART);
675     putop(c, parse_type, sel);
676     putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK);
677     putop(c, OP_BRANCH, -LABEL_LOOPSTART);
678    dispatchtarget(c, method, f, wire_type);
679     putop(c, OP_PUSHTAGDELIM, 0);
680     putop(c, OP_STARTSEQ, getsel(f, UPB_HANDLER_STARTSEQ));  /* Non-packed */
681    label(c, LABEL_LOOPSTART);
682     putop(c, parse_type, sel);
683     putop(c, OP_CHECKDELIM, LABEL_LOOPBREAK);
684     putchecktag(c, f, wire_type, LABEL_LOOPBREAK);
685     putop(c, OP_BRANCH, -LABEL_LOOPSTART);
686    label(c, LABEL_LOOPBREAK);
687     putop(c, OP_POP);  /* Packed and non-packed join. */
688     maybeput(c, OP_ENDSEQ, h, f, UPB_HANDLER_ENDSEQ);
689     putop(c, OP_SETDELIM);  /* Could remove for non-packed by dup ENDSEQ. */
690   } else {
691     putop(c, OP_CHECKDELIM, LABEL_ENDMSG);
692     putchecktag(c, f, wire_type, LABEL_DISPATCH);
693    dispatchtarget(c, method, f, wire_type);
694     putop(c, parse_type, sel);
695   }
696 }
697 
698 /* Adds bytecode for parsing the given message to the given decoderplan,
699  * while adding all dispatch targets to this message's dispatch table. */
compile_method(compiler * c,upb_pbdecodermethod * method)700 static void compile_method(compiler *c, upb_pbdecodermethod *method) {
701   const upb_handlers *h;
702   const upb_msgdef *md;
703   uint32_t* start_pc;
704   upb_msg_field_iter i;
705   upb_value val;
706 
707   UPB_ASSERT(method);
708 
709   /* Clear all entries in the dispatch table. */
710   upb_inttable_uninit(&method->dispatch);
711   upb_inttable_init(&method->dispatch, UPB_CTYPE_UINT64);
712 
713   h = upb_pbdecodermethod_desthandlers(method);
714   md = upb_handlers_msgdef(h);
715 
716  method->code_base.ofs = pcofs(c);
717   putop(c, OP_SETDISPATCH, &method->dispatch);
718   putsel(c, OP_STARTMSG, UPB_STARTMSG_SELECTOR, h);
719  label(c, LABEL_FIELD);
720   start_pc = c->pc;
721   for(upb_msg_field_begin(&i, md);
722       !upb_msg_field_done(&i);
723       upb_msg_field_next(&i)) {
724     const upb_fielddef *f = upb_msg_iter_field(&i);
725     upb_fieldtype_t type = upb_fielddef_type(f);
726 
727     if (type == UPB_TYPE_MESSAGE && !(haslazyhandlers(h, f) && c->lazy)) {
728       generate_msgfield(c, f, method);
729     } else if (type == UPB_TYPE_STRING || type == UPB_TYPE_BYTES ||
730                type == UPB_TYPE_MESSAGE) {
731       generate_delimfield(c, f, method);
732     } else {
733       generate_primitivefield(c, f, method);
734     }
735   }
736 
737   /* If there were no fields, or if no handlers were defined, we need to
738    * generate a non-empty loop body so that we can at least dispatch for unknown
739    * fields and check for the end of the message. */
740   if (c->pc == start_pc) {
741     /* Check for end-of-message. */
742     putop(c, OP_CHECKDELIM, LABEL_ENDMSG);
743     /* Unconditionally dispatch. */
744     putop(c, OP_DISPATCH, 0);
745   }
746 
747   /* For now we just loop back to the last field of the message (or if none,
748    * the DISPATCH opcode for the message). */
749   putop(c, OP_BRANCH, -LABEL_FIELD);
750 
751   /* Insert both a label and a dispatch table entry for this end-of-msg. */
752  label(c, LABEL_ENDMSG);
753   val = upb_value_uint64(pcofs(c) - method->code_base.ofs);
754   upb_inttable_insert(&method->dispatch, DISPATCH_ENDMSG, val);
755 
756   putsel(c, OP_ENDMSG, UPB_ENDMSG_SELECTOR, h);
757   putop(c, OP_RET);
758 
759   upb_inttable_compact(&method->dispatch);
760 }
761 
762 /* Populate "methods" with new upb_pbdecodermethod objects reachable from "h".
763  * Returns the method for these handlers.
764  *
765  * Generates a new method for every destination handlers reachable from "h". */
find_methods(compiler * c,const upb_handlers * h)766 static void find_methods(compiler *c, const upb_handlers *h) {
767   upb_value v;
768   upb_msg_field_iter i;
769   const upb_msgdef *md;
770   upb_pbdecodermethod *method;
771 
772   if (upb_inttable_lookupptr(&c->group->methods, h, &v))
773     return;
774 
775   method = newmethod(h, c->group);
776   upb_inttable_insertptr(&c->group->methods, h, upb_value_ptr(method));
777 
778   /* Find submethods. */
779   md = upb_handlers_msgdef(h);
780   for(upb_msg_field_begin(&i, md);
781       !upb_msg_field_done(&i);
782       upb_msg_field_next(&i)) {
783     const upb_fielddef *f = upb_msg_iter_field(&i);
784     const upb_handlers *sub_h;
785     if (upb_fielddef_type(f) == UPB_TYPE_MESSAGE &&
786         (sub_h = upb_handlers_getsubhandlers(h, f)) != NULL) {
787       /* We only generate a decoder method for submessages with handlers.
788        * Others will be parsed as unknown fields. */
789       find_methods(c, sub_h);
790     }
791   }
792 }
793 
794 /* (Re-)compile bytecode for all messages in "msgs."
795  * Overwrites any existing bytecode in "c". */
compile_methods(compiler * c)796 static void compile_methods(compiler *c) {
797   upb_inttable_iter i;
798 
799   /* Start over at the beginning of the bytecode. */
800   c->pc = c->group->bytecode;
801 
802   upb_inttable_begin(&i, &c->group->methods);
803   for(; !upb_inttable_done(&i); upb_inttable_next(&i)) {
804     upb_pbdecodermethod *method = upb_value_getptr(upb_inttable_iter_value(&i));
805     compile_method(c, method);
806   }
807 }
808 
/* Finalizes every method in the group once the bytecode is complete: turns
 * the method's code offset into an absolute pointer into the bytecode, and
 * installs the bytecode decoder's start/string/end callbacks on the method's
 * input handler. */
static void set_bytecode_handlers(mgroup *g) {
  upb_inttable_iter it;

  upb_inttable_begin(&it, &g->methods);
  while (!upb_inttable_done(&it)) {
    upb_pbdecodermethod *m = upb_value_getptr(upb_inttable_iter_value(&it));
    upb_byteshandler *input = &m->input_handler_;

    /* code_base held a word offset during compilation; from here on it holds
     * the address of the method's first instruction. */
    m->code_base.ptr = g->bytecode + m->code_base.ofs;

    upb_byteshandler_setstartstr(input, upb_pbdecoder_startbc,
                                 m->code_base.ptr);
    upb_byteshandler_setstring(input, upb_pbdecoder_decode, g);
    upb_byteshandler_setendstr(input, upb_pbdecoder_end, m);

    upb_inttable_next(&it);
  }
}
823 
824 
825 /* TODO(haberman): allow this to be constructed for an arbitrary set of dest
826  * handlers and other mgroups (but verify we have a transitive closure). */
mgroup_new(const upb_handlers * dest,bool lazy)827 const mgroup *mgroup_new(const upb_handlers *dest, bool lazy) {
828   mgroup *g;
829   compiler *c;
830 
831   g = newgroup();
832   c = newcompiler(g, lazy);
833   find_methods(c, dest);
834 
835   /* We compile in two passes:
836    * 1. all messages are assigned relative offsets from the beginning of the
837    *    bytecode (saved in method->code_base).
838    * 2. forwards OP_CALL instructions can be correctly linked since message
839    *    offsets have been previously assigned.
840    *
841    * Could avoid the second pass by linking OP_CALL instructions somehow. */
842   compile_methods(c);
843   compile_methods(c);
844   g->bytecode_end = c->pc;
845   freecompiler(c);
846 
847 #ifdef UPB_DUMP_BYTECODE
848   {
849     FILE *f = fopen("/tmp/upb-bytecode", "w");
850     UPB_ASSERT(f);
851     dumpbc(g->bytecode, g->bytecode_end, stderr);
852     dumpbc(g->bytecode, g->bytecode_end, f);
853     fclose(f);
854 
855     f = fopen("/tmp/upb-bytecode.bin", "wb");
856     UPB_ASSERT(f);
857     fwrite(g->bytecode, 1, g->bytecode_end - g->bytecode, f);
858     fclose(f);
859   }
860 #endif
861 
862   set_bytecode_handlers(g);
863   return g;
864 }
865 
866 
867 /* upb_pbcodecache ************************************************************/
868 
upb_pbcodecache_new(upb_handlercache * dest)869 upb_pbcodecache *upb_pbcodecache_new(upb_handlercache *dest) {
870   upb_pbcodecache *c = upb_gmalloc(sizeof(*c));
871 
872   if (!c) return NULL;
873 
874   c->dest = dest;
875   c->lazy = false;
876 
877   c->arena = upb_arena_new();
878   if (!upb_inttable_init(&c->groups, UPB_CTYPE_CONSTPTR)) return NULL;
879 
880   return c;
881 }
882 
/* Destroys a code cache: frees every method group it holds, then the group
 * table, the arena, and the cache object itself. */
void upb_pbcodecache_free(upb_pbcodecache *c) {
  upb_inttable_iter it;

  upb_inttable_begin(&it, &c->groups);
  while (!upb_inttable_done(&it)) {
    upb_value entry = upb_inttable_iter_value(&it);

    /* Groups are stored as const pointers; the cache owns them, so the cast
     * is what lets it free them. */
    freegroup((void*)upb_value_getconstptr(entry));
    upb_inttable_next(&it);
  }

  upb_inttable_uninit(&c->groups);
  upb_arena_free(c->arena);
  upb_gfree(c);
}
896 
/* Sets whether methods compiled by this cache parse lazy fields lazily.  Must
 * be called while the cache holds no groups (asserted). */
void upb_pbdecodermethodopts_setlazy(upb_pbcodecache *c, bool lazy) {
  const upb_inttable *groups = &c->groups;

  UPB_ASSERT(upb_inttable_count(groups) == 0);
  c->lazy = lazy;
}
901 
upb_pbcodecache_get(upb_pbcodecache * c,const upb_msgdef * md)902 const upb_pbdecodermethod *upb_pbcodecache_get(upb_pbcodecache *c,
903                                                const upb_msgdef *md) {
904   upb_value v;
905   bool ok;
906   const upb_handlers *h;
907   const mgroup *g;
908 
909   h = upb_handlercache_get(c->dest, md);
910   if (upb_inttable_lookupptr(&c->groups, md, &v)) {
911     g = upb_value_getconstptr(v);
912   } else {
913     g = mgroup_new(h, c->lazy);
914     ok = upb_inttable_insertptr(&c->groups, md, upb_value_constptr(g));
915     UPB_ASSUME(ok);
916   }
917 
918   ok = upb_inttable_lookupptr(&g->methods, h, &v);
919   UPB_ASSUME(ok);
920   return upb_value_getptr(v);
921 }
922