1 #ifndef MLR_DSL_CST_H
2 #define MLR_DSL_CST_H
3 
4 #include "cli/mlrcli.h"
5 #include "lib/context.h"
6 #include "containers/lhmsmv.h"
7 #include "containers/local_stack.h"
8 #include "containers/loop_stack.h"
9 #include "containers/type_decl.h"
10 #include "dsl/mlr_dsl_ast.h"
11 #include "dsl/mlr_dsl_blocked_ast.h"
12 #include "dsl/rval_evaluators.h"
13 #include "dsl/rxval_evaluators.h"
14 #include "dsl/function_manager.h"
15 #include "output/multi_out.h"
16 #include "output/multi_lrec_writer.h"
17 
18 // ================================================================
19 // Concrete syntax tree (CST) derived from an abstract syntax tree (AST).
20 //
21 // Statements are of the form:
22 //
23 // * Assignment of mlrval (i.e. result of expression evaluation, e.g. $name or f($x,$y)) to oosvar (out-of-stream
24 // variables, prefixed with @ sigil)
25 //
26 // * Assignment to srec (in-stream records, with field names prefixed with $ sigil)
27 //
28 // * Copying full srec ($* syntax) to/from an oosvar
29 //
30 // * Oosvar-to-oosvar assignments (recursively if RHS is non-terminal)
31 //
32 // * pattern-action statements: boolean expression with curly-braced statements which are executed only
33 //   when the boolean evaluates to true.
34 //
35 // * bare-boolean statements: no-ops unless they have side effects: namely, the matches/does-not-match
36 //   operators =~ and !=~ setting regex captures \1, \2, etc.
37 //
38 // * emit statements: these place oosvar key-value pairs into the output stream.  These can be of the following forms:
39 //
40 //   o 'emit @a; emit @b' which produce separate records such as a=3 and b=4
41 //
42 //   o 'emitf @a, @b' which produce records such as a=3,b=4
43 //
44 //   o For nested maps, 'emit @c, "x", "y"' in which case the first two map levels are pulled out and named "x" and "y"
45 //   in separate fields. See containers/mlhmmv.h for more information.
46 //
47 // Further, these statements are organized into three groups:
48 //
49 // * begin: executed once, before the first input record is read.
50 // * main:  executed for each input record.
51 // * end:   executed once, after the last input record is read.
52 //
53 // The exception, of course, is that assignment to/from srec is disallowed for begin/end statements since those occur
54 // before/after stream processing, respectively.
55 // ================================================================
56 
57 // ----------------------------------------------------------------
58 // dsl/mlr_dsl_stack_allocate.c
59 // Two-pass stack allocator which operates on the block-structured AST
60 // before the CST is built (mlr_dsl_stack_allocate.c).
61 void blocked_ast_allocate_locals(blocked_ast_t* paast, int trace); // paast: blocked AST to process; trace: presumably nonzero enables allocation tracing -- confirm in the .c file
62 
63 // ----------------------------------------------------------------
64 // Forward references for virtual-function prototypes (incomplete struct types usable by pointer until defined)
65 struct _mlr_dsl_cst_t; // Completed later in this header as mlr_dsl_cst_t
66 struct _mlr_dsl_cst_statement_t; // Completed later in this header as mlr_dsl_cst_statement_t
67 struct _subr_defsite_t; // Completed later in this header as subr_defsite_t
68 
69 // Parameter bag to reduce parameter-marshaling; passed by pointer to the statement and block handlers declared below.
70 typedef struct _cst_outputs_t {
71 	int*    pshould_emit_rec; // Presumably an out-flag saying whether the current record is emitted -- confirm
72 	sllv_t* poutrecs; // Presumably the list collecting output records -- confirm
73 	char*   oosvar_flatten_separator; // Presumably the separator used when flattening oosvars for output -- confirm
74 	cli_writer_opts_t* pwriter_opts; // Writer options (type presumably from cli/mlrcli.h -- confirm)
75 } cst_outputs_t;
76 
77 // ----------------------------------------------------------------
78 typedef struct _cst_statement_block_t { // A statement list together with a per-block variable count
79 	int subframe_var_count; // Presumably the number of local-variable slots for this block's subframe -- confirm
80 	sllv_t* pstatements; // The block's statements; presumably elements are mlr_dsl_cst_statement_t* -- confirm
81 } cst_statement_block_t;
82 
83 cst_statement_block_t* cst_statement_block_alloc(int subframe_var_count); // Constructor; pairs with cst_statement_block_free
84 void cst_statement_block_free(cst_statement_block_t* pblock, context_t* pctx); // Destructor; NOTE(review): whether pblock may be NULL is not visible here
85 
86 // ----------------------------------------------------------------
87 typedef struct _cst_top_level_statement_block_t { // A statement block plus a stack frame and a maximum variable depth
88 	local_stack_frame_t* pframe; // Local-variable stack frame (type presumably from containers/local_stack.h -- confirm)
89 	int max_var_depth; // Presumably the frame size computed by the stack allocator -- confirm
90 	cst_statement_block_t* pblock; // The statements of this top-level block
91 } cst_top_level_statement_block_t;
92 
93 cst_top_level_statement_block_t* cst_top_level_statement_block_alloc(int max_var_depth, int subframe_var_count); // Constructor; pairs with cst_top_level_statement_block_free
94 void cst_top_level_statement_block_free(cst_top_level_statement_block_t* pblock, context_t* pctx); // Destructor; NOTE(review): whether it also frees pframe/pblock members is not visible here
95 
96 // ----------------------------------------------------------------
97 // Generic handler for a statement.
98 
99 // Handler for statement lists: begin/main/end; cond/if/for/while/do-while.
100 typedef void mlr_dsl_cst_block_handler_t( // A function type (not a pointer type); stored in structs as mlr_dsl_cst_block_handler_t*
101 	cst_statement_block_t* pblock, // The block whose statements are to be handled
102 	variables_t*           pvars, // Variable state passed to every handler (type declared outside this header)
103 	cst_outputs_t*         pcst_outputs); // Output parameter bag
104 
105 // ----------------------------------------------------------------
106 // mlr_dsl_cst_statement_t is a base class extended by all manner of subclasses.
107 // The following are for their method pointers.
108 typedef struct _mlr_dsl_cst_statement_t* mlr_dsl_cst_statement_allocator_t( // Function type; also used in this header to declare the alloc_* functions
109 	struct _mlr_dsl_cst_t* pcst, // The CST being built
110 	mlr_dsl_ast_node_t*    pnode, // AST node from which the statement is constructed
111 	int                    type_inferencing, // NOTE(review): meaning not visible in this header
112 	int                    context_flags); // NOTE(review): flag values not visible in this header
113 
114 typedef void mlr_dsl_cst_statement_handler_t( // Function type for a statement's handler; held as pstatement_handler in mlr_dsl_cst_statement_t
115 	struct _mlr_dsl_cst_statement_t* pstatement, // The statement being handled
116 	variables_t*                     pvars, // Variable state passed to every handler
117 	cst_outputs_t*                   pcst_outputs); // Output parameter bag
118 
119 typedef void mlr_dsl_cst_statement_freer_t( // Function type for a statement subclass destructor; held as pstatement_freer in mlr_dsl_cst_statement_t
120 	struct _mlr_dsl_cst_statement_t* pstatement, // The statement whose subclass state is to be released
121 	context_t* pctx); // Context (type presumably from lib/context.h -- confirm)
122 
123 // ----------------------------------------------------------------
124 // MLR_DSL_CST_STATEMENT OBJECT
125 
126 typedef struct _mlr_dsl_cst_statement_t { // Base statement: AST back-pointer, handler/freer function pointers, optional body, opaque subclass state
127 
128 	//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
129 	// Common to most or all statement types:
130 
131 	// For trace-mode.
132 	mlr_dsl_ast_node_t* past_node;
133 
134 	// Function-pointer for the handler of the given statement type, e.g. srec-assignment, while-loop, etc.
135 	mlr_dsl_cst_statement_handler_t* pstatement_handler;
136 
137 	// Subclass destructor. It should free whatever's in the pvstate but it should not
138 	// free the pstatement itself.
139 	mlr_dsl_cst_statement_freer_t* pstatement_freer;
140 
141 	// The reason for this being a function pointer is that there are two variants of
142 	// statement-list handlers: one for inside loop bodies which has to check
143 	// break/continue flags after each statement, and another for outside loop bodies
144 	// which doesn't need to check those. (This is a micro-optimization.) For bodyless
145 	// statements (e.g. assignment) this is null.
146 	cst_statement_block_t* pblock; // Statement body, for statement types that have one
147 	mlr_dsl_cst_block_handler_t* pblock_handler; // The function pointer the comment above describes; presumably null together with pblock -- confirm
148 
149 	//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
150 	// Specific to each statement type:
151 
152 	void* pvstate; // Opaque per-subclass state; its contents are what pstatement_freer is expected to free
153 
154 } mlr_dsl_cst_statement_t;
155 
156 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
157 // For use by the statement-subclass constructors
158 
159 mlr_dsl_cst_statement_t* mlr_dsl_cst_statement_valloc( // Base-object constructor for statements without a block body
160 	mlr_dsl_ast_node_t*                    past_node, // Stored for trace-mode
161 	mlr_dsl_cst_statement_handler_t*       pstatement_handler, // Handler for this statement type
162 	mlr_dsl_cst_statement_freer_t*         pstatement_freer, // Subclass destructor
163 	void*                                  pvstate); // Subclass-specific state
164 
165 mlr_dsl_cst_statement_t* mlr_dsl_cst_statement_valloc_with_block( // Base-object constructor for statements that carry a block body
166 	mlr_dsl_ast_node_t*                    past_node, // Stored for trace-mode
167 	mlr_dsl_cst_statement_handler_t*       pstatement_handler, // Handler for this statement type
168 	cst_statement_block_t*                 pblock, // The statement's body
169 	mlr_dsl_cst_block_handler_t*           pblock_handler, // Which block-handler variant runs the body
170 	mlr_dsl_cst_statement_freer_t*         pstatement_freer, // Subclass destructor
171 	void*                                  pvstate); // Subclass-specific state
172 
173 // ----------------------------------------------------------------
174 // MLR_DSL_CST OBJECT
175 
176 typedef struct _mlr_dsl_cst_t { // The whole CST: begin/main/end blocks, function manager, subroutines, and the retained AST
177 	sllv_t* pbegin_blocks; // List of begin blocks; presumably elements are cst_top_level_statement_block_t* -- confirm
178 	cst_top_level_statement_block_t* pmain_block; // The single main block
179 	sllv_t* pend_blocks; // List of end blocks; element type presumably as for pbegin_blocks -- confirm
180 
181 	// Function manager for built-in functions as well as user-defined functions (which are CST-specific).
182 	fmgr_t* pfmgr;
183 
184 	// Subroutine bodies
185 	lhmsv_t* psubr_defsites; // Presumably keyed by subroutine name with subr_defsite_t* values -- confirm
186 
187 	// Subroutine callsites, used to bootstrap (e.g. subroutine f calls subroutine g before the latter
188 	// has been defined).
189 	sllv_t* psubr_callsite_statements_to_resolve; // Presumably consumed via mlr_dsl_cst_resolve_subr_callsite -- confirm
190 
191 	// fflush on emit/tee/print/dump
192 	int flush_every_record;
193 
194 	// The CST object retains the AST pointer (in order to reuse its strings etc. with minimal copying)
195 	// and will free the AST in the CST destructor.
196 	blocked_ast_t* paast;
197 } mlr_dsl_cst_t;
198 
199 // ----------------------------------------------------------------
200 // CONSTRUCTORS/DESTRUCTORS/METHODS
201 
202 // Notes:
203 // * do_final_filter is FALSE for mlr put, TRUE for mlr filter.
204 // * negate_final_filter is TRUE for mlr filter -x.
205 // * The CST object strips nodes off the raw AST, constructed by the Lemon parser, in order
206 //   to do analysis on it. Nonetheless the caller should free what's left.
207 mlr_dsl_cst_t* mlr_dsl_cst_alloc(mlr_dsl_ast_t* past, int print_ast, int trace_stack_allocation, // The int parameters appear to be boolean flags -- confirm
208 	int type_inferencing, int flush_every_record, int do_final_filter, int negate_final_filter); // Release the result with mlr_dsl_cst_free
209 
210 mlr_dsl_cst_statement_t* mlr_dsl_cst_alloc_statement(mlr_dsl_cst_t* pcst, mlr_dsl_ast_node_t* pnode, // Builds a CST statement from an AST node; presumably dispatches to the alloc_* functions -- confirm
211 	int type_inferencing, int context_flags);
212 
213 mlr_dsl_cst_statement_t* mlr_dsl_cst_alloc_final_filter_statement(mlr_dsl_cst_t* pcst, // Same parameter list as alloc_final_filter declared in this header
214 	mlr_dsl_ast_node_t* pnode, int negate_final_filter, int type_inferencing, int context_flags);
215 
216 void mlr_dsl_cst_free(mlr_dsl_cst_t* pcst, context_t* pctx); // CST destructor; per the paast field comment it also frees the retained AST
217 void mlr_dsl_cst_statement_free(mlr_dsl_cst_statement_t* pstatement, context_t* pctx); // Statement destructor; presumably invokes pstatement_freer and then frees pstatement -- confirm
218 
219 // Top-level entry point, e.g. from mapper_put.
220 void mlr_dsl_cst_handle_top_level_statement_blocks(
221 	sllv_t*      ptop_level_blocks, // block bodies for begins, main, ends
222 	variables_t* pvars, // Variable state passed down to the statement handlers
223 	cst_outputs_t* pcst_outputs); // Output parameter bag
224 
225 void mlr_dsl_cst_handle_top_level_statement_block( // Single-block counterpart of the list-taking function above
226 	cst_top_level_statement_block_t* ptop_level_block, // The one top-level block to handle
227 	variables_t* pvars, // Variable state passed down to the statement handlers
228 	cst_outputs_t* pcst_outputs); // Output parameter bag
229 
230 // Recursive entry point: block bodies for begin, main, end; cond, if, for, while.
231 void mlr_dsl_cst_handle_statement_block( // Signature matches mlr_dsl_cst_block_handler_t
232 	cst_statement_block_t* pblock,
233 	variables_t*           pvars,
234 	cst_outputs_t*         pcst_outputs);
235 
236 void mlr_dsl_cst_handle_statement_block_with_break_continue( // Also matches mlr_dsl_cst_block_handler_t; presumably the loop-body variant that checks break/continue -- confirm
237 	cst_statement_block_t* pblock,
238 	variables_t*           pvars,
239 	cst_outputs_t*         pcst_outputs);
240 
241 // Statement lists which are not curly-braced bodies: start/continuation/update statements for triple-for.
242 void mlr_dsl_cst_handle_statement_list(
243 	sllv_t*        pstatements, // Bare statement list (no enclosing cst_statement_block_t)
244 	variables_t*   pvars, // Variable state passed down to the statement handlers
245 	cst_outputs_t* pcst_outputs); // Output parameter bag
246 
247 // ================================================================
248 // dsl/mlr_dsl_cst_func_subr.c
249 
250 // ----------------------------------------------------------------
251 // cst_udf_state_t is data needed to execute the body of a user-defined function which is implemented by CST statements.
252 // udf_defsite_state_t is data needed for any user-defined function (no matter how implemented).
253 typedef struct _cst_udf_state_t {
254 	char*     name; // Function name
255 	int       arity; // Number of parameters
256 	char**    parameter_names; // Presumably an array of length arity -- confirm
257 	int*      parameter_type_masks; // Presumably one type mask per parameter -- confirm
258 	cst_top_level_statement_block_t* ptop_level_block; // The function body
259 	char*     return_value_type_name; // Declared return type, as a name
260 	int       return_value_type_mask; // Declared return type, as a mask
261 } cst_udf_state_t;
262 
263 udf_defsite_state_t* mlr_dsl_cst_alloc_udf( // Same parameter list as mlr_dsl_cst_statement_allocator_t, but returns a udf_defsite_state_t*
264 	mlr_dsl_cst_t*      pcst,
265 	mlr_dsl_ast_node_t* pnode,
266 	int                 type_inferencing,
267 	int                 context_flags);
268 
269 void mlr_dsl_cst_free_udf(cst_udf_state_t* pstate, context_t* pctx); // Takes cst_udf_state_t*, not the udf_defsite_state_t* returned by the allocator above
270 
271 // ----------------------------------------------------------------
272 
273 typedef struct _subr_callsite_t { // What is recorded at a subroutine call before its definition is bound
274 	char* name; // Name of the subroutine being called
275 	int   arity; // Number of arguments at the callsite
276 	int   type_inferencing; // Presumably saved from the allocator arguments -- confirm
277 	int   context_flags; // Presumably saved from the allocator arguments -- confirm
278 } subr_callsite_t;
279 
280 typedef struct _subr_defsite_t { // A subroutine definition; same fields as cst_udf_state_t minus the return-type members
281 	char*     name; // Subroutine name
282 	int       arity; // Number of parameters
283 	char**    parameter_names; // Presumably an array of length arity -- confirm
284 	int*      parameter_type_masks; // Presumably one type mask per parameter -- confirm
285 	cst_top_level_statement_block_t* ptop_level_block; // The subroutine body
286 } subr_defsite_t;
287 
288 subr_defsite_t* mlr_dsl_cst_alloc_subroutine( // Same parameter list as mlr_dsl_cst_statement_allocator_t, but returns a subr_defsite_t*
289 	mlr_dsl_cst_t*      pcst,
290 	mlr_dsl_ast_node_t* pnode,
291 	int                 type_inferencing,
292 	int                 context_flags);
293 
294 void mlr_dsl_cst_free_subroutine(subr_defsite_t* psubr_defsite, context_t* pctx); // Destructor for a subr_defsite_t
295 
296 // Invoked directly from the CST statement handler for a subroutine callsite.
297 // (Functions, by contrast, are invoked by callback from the right-hand-side-evaluator logic
298 // -- hence no execute-function method here.)
299 void mlr_dsl_cst_execute_subroutine(subr_defsite_t* pstate, variables_t* pvars, // pstate: the subroutine definition to run
300 	cst_outputs_t* pcst_outputs, int callsite_arity, boxed_xval_t* args); // args: presumably an array of callsite_arity values -- confirm
301 
302 // ================================================================
303 // For on-line help / manpage
304 // dsl/mlr_dsl_cst_keywords.c
305 
306 void mlr_dsl_list_all_keywords_raw(FILE* output_stream); // Output goes to output_stream; NOTE(review): FILE relies on <stdio.h> arriving via another include
307 
308 // Pass keyword == NULL to get usage for all keywords:
309 void mlr_dsl_keyword_usage(FILE* output_stream, char* keyword);
310 
311 // ================================================================
312 // Specific CST-statement subclasses
313 
314 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
315 // dsl/mlr_dsl_cst_condish_statements.c
316 mlr_dsl_cst_statement_allocator_t alloc_conditional_block; // Function declaration via the allocator function type: (pcst, pnode, type_inferencing, context_flags)
317 mlr_dsl_cst_statement_allocator_t alloc_if_head;
318 mlr_dsl_cst_statement_allocator_t alloc_while;
319 mlr_dsl_cst_statement_allocator_t alloc_do_while;
320 mlr_dsl_cst_statement_allocator_t alloc_bare_boolean;
321 
322 mlr_dsl_cst_statement_t* alloc_filter( // Spelled out, but the same signature as mlr_dsl_cst_statement_allocator_t
323 	mlr_dsl_cst_t*      pcst,
324 	mlr_dsl_ast_node_t* pnode,
325 	int                 type_inferencing,
326 	int                 context_flags);
327 
328 mlr_dsl_cst_statement_t* alloc_final_filter( // Not expressible with the allocator typedef because of the extra negate_final_filter parameter
329 	mlr_dsl_cst_t*      pcst,
330 	mlr_dsl_ast_node_t* pnode,
331 	int                 negate_final_filter,
332 	int                 type_inferencing,
333 	int                 context_flags);
334 
335 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
336 // dsl/mlr_dsl_cst_terminal_assignment_statements.c
337 mlr_dsl_cst_statement_allocator_t alloc_srec_assignment; // Function declaration via the allocator function type: (pcst, pnode, type_inferencing, context_flags)
338 mlr_dsl_cst_statement_allocator_t alloc_indirect_srec_assignment;
339 mlr_dsl_cst_statement_allocator_t alloc_positional_srec_name_assignment;
340 mlr_dsl_cst_statement_allocator_t alloc_env_assignment;
341 
342 // dsl/mlr_dsl_cst_map_assignment_statements.c
343 mlr_dsl_cst_statement_allocator_t alloc_full_srec_assignment;
344 mlr_dsl_cst_statement_t* alloc_local_variable_definition( // Not expressible with the allocator typedef because of the extra type_mask parameter
345 	mlr_dsl_cst_t*      pcst,
346 	mlr_dsl_ast_node_t* pnode,
347 	int                 type_inferencing,
348 	int                 context_flags,
349 	int                 type_mask); // Presumably the declared type of the local being defined -- confirm
350 mlr_dsl_cst_statement_allocator_t alloc_nonindexed_local_variable_assignment;
351 mlr_dsl_cst_statement_allocator_t alloc_indexed_local_variable_assignment;
352 mlr_dsl_cst_statement_allocator_t alloc_oosvar_assignment;
353 mlr_dsl_cst_statement_allocator_t alloc_full_oosvar_assignment;
354 
355 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
356 // dsl/mlr_dsl_cst_unset_statements.c
357 mlr_dsl_cst_statement_allocator_t alloc_unset; // Function declaration via the allocator function type: (pcst, pnode, type_inferencing, context_flags)
358 
359 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
360 // dsl/mlr_dsl_cst_for_srec_statements.c
361 mlr_dsl_cst_statement_allocator_t alloc_for_srec; // Function declaration via the allocator function type: (pcst, pnode, type_inferencing, context_flags)
362 mlr_dsl_cst_statement_allocator_t alloc_for_srec_key_only;
363 
364 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
365 // dsl/mlr_dsl_cst_for_map_statements.c
366 mlr_dsl_cst_statement_allocator_t alloc_for_map; // Function declaration via the allocator function type: (pcst, pnode, type_inferencing, context_flags)
367 mlr_dsl_cst_statement_allocator_t alloc_for_map_key_only;
368 
369 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
370 // dsl/mlr_dsl_cst_triple_for_statements.c
371 mlr_dsl_cst_statement_allocator_t alloc_triple_for; // Function declaration via the allocator function type: (pcst, pnode, type_inferencing, context_flags)
372 
373 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
374 // dsl/mlr_dsl_cst_loop_control_statements.c
375 mlr_dsl_cst_statement_allocator_t alloc_break; // Function declaration via the allocator function type: (pcst, pnode, type_inferencing, context_flags)
376 mlr_dsl_cst_statement_allocator_t alloc_continue;
377 
378 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
379 // dsl/mlr_dsl_cst_return_statements.c (both are function declarations via the allocator function type)
380 mlr_dsl_cst_statement_allocator_t alloc_return_void;  // For subroutines
381 mlr_dsl_cst_statement_allocator_t alloc_return_value; // For UDFs
382 
383 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
384 // dsl/mlr_dsl_cst_output_statements.c
385 
386 mlr_dsl_cst_statement_t* alloc_print( // Allocator parameters plus a terminator string
387 	mlr_dsl_cst_t*      pcst,
388 	mlr_dsl_ast_node_t* pnode,
389 	int                 type_inferencing,
390 	int                 context_flags,
391 	char*               print_terminator); // Presumably the text written after the printed value -- confirm
392 
393 mlr_dsl_cst_statement_allocator_t alloc_tee; // Function declaration via the allocator function type: (pcst, pnode, type_inferencing, context_flags)
394 
395 mlr_dsl_cst_statement_allocator_t alloc_emitf; // Function declaration via the allocator function type
396 
397 mlr_dsl_cst_statement_t* alloc_emit( // Allocator parameters plus a prefixing flag
398 	mlr_dsl_cst_t*      pcst,
399 	mlr_dsl_ast_node_t* pnode,
400 	int                 type_inferencing,
401 	int                 context_flags,
402 	int                 do_full_prefixing); // NOTE(review): effect on emitted field names not visible in this header
403 
404 mlr_dsl_cst_statement_t* alloc_emit_lashed( // Same parameter list as alloc_emit
405 	mlr_dsl_cst_t*      pcst,
406 	mlr_dsl_ast_node_t* pnode,
407 	int                 type_inferencing,
408 	int                 context_flags,
409 	int                 do_full_prefixing);
410 
411 mlr_dsl_cst_statement_allocator_t alloc_dump; // Function declaration via the allocator function type
412 
413 //  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
414 // dsl/mlr_dsl_cst_func_subr.c
415 
416 // When we allocate a callsite we can do so before the callee has been defined.
417 // Hence the two-step process, with the second step being an object-binding step.
418 mlr_dsl_cst_statement_allocator_t alloc_subr_callsite_statement; // Step one: function declaration via the allocator function type
419 void mlr_dsl_cst_resolve_subr_callsite(mlr_dsl_cst_t* pcst, mlr_dsl_cst_statement_t* pstatement); // Step two: binds the callsite statement; presumably looks the callee up in pcst->psubr_defsites -- confirm
420 
421 #endif // MLR_DSL_CST_H
422