1 //  crm114_structs.h  - structures for CRM114
2 
3 // Copyright 2009 William S. Yerazunis.
4 // This file is under GPLv3, as described in COPYING.
5 
6 #ifndef __CRM114_STRUCTS_H__
7 #define __CRM114_STRUCTS_H__
8 
//    Systemwide globals, declared here and defined elsewhere.  Sure, they
//    should go into one global struct, but that realization only occurred
//    to me in 2008.  Sorry.
//
//    NOTE(review): the variables without a comment below are not
//    documented in this header; check their defining file before relying
//    on what the names suggest.

extern long vht_size;
extern long max_pgmsize;
extern long user_trace;
extern long internal_trace;
extern long debug_countdown;
extern long cmdline_break;
extern long cycle_counter;
extern long ignore_environment_vars;
extern long data_window_size;

//    Number of hash table buckets.  Set from the command line; the
//    classifier learns only ever read it.
extern long sparse_spectrum_file_length;

extern long microgroom_chain_length;
extern long microgroom_stop_after;

extern float min_pmax_pmin_ratio;
extern long profile_execution;

extern int dontcare;

//    Listing style:  0 = none, 1 = basic, 2 = expanded, 3 = parsecode
extern long prettyprint_listing;

//    All internal errors will use this number or higher; the user
//    programs can use lower numbers freely.
extern long engine_exit_base;


//    How should math be handled?
//      0 = no extended (non-EVAL) math, use algebraic notation
//      1 = no extended (non-EVAL) math, use RPN
//      2 = extended (everywhere) math, use algebraic notation
//      3 = extended (everywhere) math, use RPN
extern long q_expansion_mode;
46 
47 
//   One cell of the variable hash table (VHT).
//
//   Note that each file gets an entry too, with the name of the file
//   being the name of the variable - no colons!
//
//   Also note that a cell has no "next" pointer.  Overflow is handled
//   in-table: if a table entry is in use, the next available table entry
//   is used instead, wrapping around.  It's easy to change in any case.
//
struct mythical_vht_cell
{
  char *filename;       // file where defined (or NULL)
  int filedesc;         // file descriptor of the defining file
                        //  NOTE(review): the original comment said
                        //  "(or NULL)"; the actual "no file" value for
                        //  this int is not visible in this header.
  char *nametxt;        // block of text that hosts the variable name
  long nstart;          // index into nametxt of the start of the varname
  long nlen;            // length of the name

  char *valtxt;         // text block that hosts the captured value.
                        //  vstart, vlen, mstart, and mlen are all measured
                        //  from the _start_ of valtxt, mstart relative to
                        //  vstart, etc!!!
  long vstart;          // zero-based index of start of variable (inclusive)
  long vlen;            // length of captured value; vstart + vlen is where
                        //  a NUL terminator could go if you wanted one.
  long mstart;          // zero-based start of most recent match of this var
  long mlen;            // length of most recent match against this var;
                        //  mstart + mlen is where a NUL terminator could
                        //  go if you wanted one.

  long linenumber;      // line number of this variable (if known, else -1)
  long lazy_redirects;  // how many lazy redirects are allowed (0 by default)
};
typedef struct mythical_vht_cell VHT_CELL;
76 
//   The argparse block holds start pointers and lengths for the parsed
//   arguments of one statement, plus the statement's flag bits.
//
//   It is filled in at run time.  At least in principle that could be
//   done at microcompile time, but var-expansion needs to be done at
//   statement execution time, so we don't fill it in till we have to,
//   and then we cache the result.
//
//   NOTE(review): the a/p/b/s prefixes presumably stand for the angle,
//   paren, box and slash argument groups (compare CRM_ANGLES,
//   CRM_PARENS, CRM_BOXES, CRM_SLASHES) - confirm against the parser.
//
struct mythical_argparse_block
{
  char *a1start;   long a1len;
  char *p1start;   long p1len;
  char *p2start;   long p2len;
  char *p3start;   long p3len;
  char *b1start;   long b1len;
  char *s1start;   long s1len;
  char *s2start;   long s2len;
  unsigned long long sflags;    // same width as the CRM_* flag constants
};
typedef struct mythical_argparse_block ARGPARSE_BLOCK;
101 
102 
103 
104 // structure of a microcompile table cell (one such per statement)
105 //
106 //  These table entries get filled in during microcompile operation.
107 //
108 typedef struct mythical_mct_cell {
109   char *hosttxt;         // text file this statement lives in.
110   ARGPARSE_BLOCK *apb;   // the argparse block for this statement
111   long start;            // zero-base index of start of statement (inclusive)
112   long fchar;            // zero-base index of non-blank stmt (for prettyprint)
113   long achar;            // zero-base index of start of args;
114   long stmt_utime;       // user time spent in this statement line;
115   long stmt_stime;       // system time spent in this statement line;
116   int stmt_type;         // statement type of this line
117   int nest_level;        // nesting level of this statement
118   int fail_index;        // if this statement failed, where would we go?
119   int liaf_index;        // if this statement liafed, where would we go?
120   int trap_index;        // if this statement faults, where would we go?
121   int stmt_break;        // 1 if "break" on this stmt, 0 otherwise.
122 } MCT_CELL;
123 
124 // structure of a control stack level cell.
125 //   Nota Bene:  CSL cells are used to both retain toplevel data about
126 //   any particular file being executed as well as being used to retain
127 //   data on any file that is data!  If a file is executable, then the
128 //   mct pointer is a pointer to the compiled MCT table, else the mct
129 //   pointer is a NULL and the file is not executable.
130 //
131 typedef struct mythical_csl_cell {
132   char *filename;        //filename if any
133   long rdwr;             // 0=readonly, 1=rdwr
134   long filedes;          //  file descriptor it's open on (if any)
135   char *filetext;        //  text buffer
136   long nchars;           //  characters of data we have
137   unsigned int hash;     //  hash of this data (if done)
138   MCT_CELL **mct;        //  microcompile (if compiled)
139   long nstmts;           //  how many statements in the microcompile
140   long preload_window;   //  do we preload the window or not?
141   long cstmt;            //  current executing statement of this file
142   void *caller;          //  pointer to this file's caller (if any)
143   long return_vht_cell;  //  index into the VHT to stick the return value
144   long calldepth;        //  how many calls deep is this stack frame
145   long aliusstk[MAX_BRACKETDEPTH]; // the status stack for ALIUS
146 } CSL_CELL;
147 
//     The standardized header for our statistical files (well, the new
//     standard - the original author describes it as a 1024-byte header).
//     Old file types don't have this.  Forward migration shall take
//     place.  :-)
//
//     A chunk descriptor locates one chunk of data in the file by its
//     start, its length, and a tag.

typedef struct
{
  uint32_t start;       //  where the chunk starts
  uint32_t length;      //  how long the chunk is
  uint32_t tag;         //  tag of the chunk
} STATISTICS_FILE_CHUNK;
157 
158 typedef struct {
159   uint8_t file_ident_string [ STATISTICS_FILE_IDENT_STRING_MAX ];
160                             //  Text description of this file.  This should
161                             //  always start with "CRM114 Classdata " and then
162                             //  the classifier name etc.  Embed versioning
163                             //  information here (and get it back with strtod)
164                             //  Please pad unused space with NULLs; don't
165                             //  change the length (to make file-magic easier).
166                             //  This is always chunks[0].
167                             //
168   STATISTICS_FILE_CHUNK chunks [ STATISTICS_FILE_NCHUNKS ];
169                             //  The byte indexed chunks of data in this file,
170                             //  by start, length, and tag.
171                             //  chunks[1] points to this array itself.
172                             //  A -1 length means "to the
173                             //  end of the file"
174                             //
175   ////////////////////////////
176   //      Following in the file are more data chunks.  Note that there's
177   //      plenty of space here for pre-solves (such as an SVM might generate)
178   //      but probably NOT enough space for individual examples to get their
179   //      own chunks, unless you change the default number of chunks upward
180   //      from 1024.
181   ////////////////////////////
182 }   STATISTICS_FILE_HEADER_STRUCT;
183 
//  One feature bucket: three unsigned ints - hash, key, and value.
typedef struct
{
  unsigned int hash;
  unsigned int key;
  unsigned int value;
} FEATUREBUCKET_STRUCT;
189 
190 
//  Feature file header: a 4-byte version, a flags word and a skip_to
//  word.
//  NOTE(review): flags and skip_to are unsigned long, whose width varies
//  between platforms; keep that in mind if this struct is ever written
//  to disk verbatim.
typedef struct
{
  unsigned char version[4];
  unsigned long flags;
  unsigned long skip_to;
} FEATURE_HEADER_STRUCT;
196 
197 
//  One Winnow feature bucket: hash and key as in FEATUREBUCKET_STRUCT,
//  but with a float value instead of an unsigned int.
typedef struct
{
  unsigned int hash;
  unsigned int key;
  float value;
} WINNOW_FEATUREBUCKET_STRUCT;
203 
//  Header of an entropy classifier file.
//  NOTE(review): ENTROPY_RESERVED_HEADER_LEN is presumably the number of
//  bytes reserved for this header, and the *start / *len pairs presumably
//  locate the FIR lattice and the node array - confirm against the
//  entropy classifier code.
#define ENTROPY_RESERVED_HEADER_LEN 1024

typedef struct
{
  long firlatstart;
  long firlatlen;
  long nodestart;
  long nodeslen;
  long long totalbits;
} ENTROPY_HEADER_STRUCT;
212 
//  One alphabet slot of an entropy cell: a count and a nextcell value.
struct mythical_entropy_alphabet_slot
{
  long count;
  long nextcell;
};
typedef struct mythical_entropy_alphabet_slot ENTROPY_ALPHABET_SLOT;
217 
218 //  28 byte header, 24 bytes alph (52 tot).  Pare: 16 header, 16 alph (36 tot)
219 typedef struct mythical_entropy_cell {
220   double fir_prior;
221   long fir_larger;
222   long fir_smaller;
223   long firlat_slot;
224   //  long total_count;
225   ENTROPY_ALPHABET_SLOT abet[ENTROPY_ALPHABET_SIZE];
226 } ENTROPY_FEATUREBUCKET_STRUCT;
227 
228 
//   TMS struct - used for measuring process time.
struct mythical_tms_struct
{
  clock_t tms_utime;    // user time
  clock_t tms_stime;    // system time
  clock_t tms_cutime;   // user time of children
  clock_t tms_cstime;   // system time of children
};
typedef struct mythical_tms_struct TMS_STRUCT;
236 
237 
//  Statement type codes used by the microcompile; one code per kind of
//  statement.
//
#define CRM_BOGUS           0
#define CRM_NOOP            1
#define CRM_EXIT            2
#define CRM_OPENBRACKET     3
#define CRM_CLOSEBRACKET    4
#define CRM_LABEL           5
#define CRM_GOTO            6
#define CRM_MATCH           7
#define CRM_FAIL            8
#define CRM_LIAF            9
#define CRM_ACCEPT         10
#define CRM_TRAP           11
#define CRM_FAULT          12
#define CRM_INPUT          13
#define CRM_OUTPUT         14
#define CRM_WINDOW         15
#define CRM_ALTER          16
#define CRM_CALL           17
#define CRM_ROUTINE        18
#define CRM_RETURN         19
#define CRM_SYSCALL        20
#define CRM_LEARN          21
#define CRM_CLASSIFY       22
#define CRM_ISOLATE        23
#define CRM_HASH           24
#define CRM_INTERSECT      25
#define CRM_UNION          26
#define CRM_EVAL           27
#define CRM_ALIUS          28
#define CRM_TRANSLATE      29
#define CRM_DEBUG          30
//  make clusters out of tokens
#define CRM_CLUMP          31
//  pmulc translates tokens to cluster names
#define CRM_PMULC          32
//  makes a "lazy" variable.
#define CRM_LAZY           33
#define CRM_UNIMPLEMENTED  34
275 
276 
//      FLAGS FLAGS FLAGS
//       Every valid CRM114 flag is listed here, one bit per flag, each
//       an unsigned long long constant.
//
//      GROT GROT GROT - You must keep this in synchrony with the
//      definitions of the keywords in crm_stmt_parser!!!  Yes, I'd
//      love to define it in one place and one place only, but I haven't
//      figured out a way to do that well.

//  ---- match searchstart flags
#define CRM_FROMSTART     (1ULL << 0)
#define CRM_FROMNEXT      (1ULL << 1)
#define CRM_FROMEND       (1ULL << 2)
#define CRM_NEWEND        (1ULL << 3)
#define CRM_FROMCURRENT   (1ULL << 4)

//  ---- match control flags
#define CRM_NOCASE        (1ULL << 5)
#define CRM_ABSENT        (1ULL << 6)
#define CRM_BASIC         (1ULL << 7)
#define CRM_BACKWARDS     (1ULL << 8)
#define CRM_LITERAL       (1ULL << 9)
#define CRM_NOMULTILINE   (1ULL << 10)

//  ---- input/output/window flags
//       (BYLINE shares its bit with NOMULTILINE)
#define CRM_BYLINE        CRM_NOMULTILINE
#define CRM_BYCHAR        (1ULL << 11)
//       string is bychar.  I think...
#define CRM_STRING        CRM_BYCHAR
#define CRM_BYCHUNK       (1ULL << 12)
#define CRM_BYEOF         (1ULL << 13)
#define CRM_EOFACCEPTS    (1ULL << 14)
#define CRM_EOFRETRY      (1ULL << 15)
#define CRM_APPEND        (1ULL << 16)

//  ---- process control flags
#define CRM_KEEP          (1ULL << 17)
#define CRM_ASYNC         (1ULL << 18)

//  ---- learn and classify
#define CRM_REFUTE        (1ULL << 19)
#define CRM_MICROGROOM    (1ULL << 20)
#define CRM_MARKOVIAN     (1ULL << 21)
//       synonym with OSB feature gen
#define CRM_OSB_BAYES     (1ULL << 22)
#define CRM_OSB           CRM_OSB_BAYES
#define CRM_CORRELATE     (1ULL << 23)
//       synonym to Winnow feature combiner
#define CRM_OSB_WINNOW    (1ULL << 24)
#define CRM_WINNOW        CRM_OSB_WINNOW
#define CRM_CHI2          (1ULL << 25)
#define CRM_UNIQUE        (1ULL << 26)
#define CRM_ENTROPY       (1ULL << 27)
//       synonym with OSBF local rule
#define CRM_OSBF          (1ULL << 28)
#define CRM_OSBF_BAYES    CRM_OSBF
#define CRM_HYPERSPACE    (1ULL << 29)
#define CRM_UNIGRAM       (1ULL << 30)
#define CRM_CROSSLINK     (1ULL << 31)

//  ---- Flags that need to be sorted back in.  From here on the bits no
//       longer fit in 32 bits.
//       input
#define CRM_READLINE      (1ULL << 32)
//       isolate flags
#define CRM_DEFAULT       (1ULL << 33)
//       SKS classifier
#define CRM_SKS           (1ULL << 34)
//       SVM classifier
#define CRM_SVM           (1ULL << 35)
//       FSCM classifier
#define CRM_FSCM          (1ULL << 36)
//       Neural Net classifier
#define CRM_NEURAL_NET    (1ULL << 37)
//
#define CRM_ERASE         (1ULL << 38)
//       PCA classifier
#define CRM_PCA           (1ULL << 39)
//     A struct to put the flags in: the flag's text paired with its
//     unsigned long long bit value.
typedef struct {
  char *string;
  unsigned long long value;
} FLAG_DEF;
351 
352 
//*****************************************************************
//
//     The following table describes the statements allowed in CRM114.
//
//      Each entry is one line of STMT_TABLE_TYPE, and gives the text
//      representation of the command, the internal dispatch code,
//      whether the statement is "executable" or not, what the minimum
//      and maximum number of slash-groups, paren-groups, and box-groups
//      are for the statement to make sense, and what flags are allowed
//      for that statement.
//
//      flags_allowed_mask is an unsigned long long because the CRM_*
//      flag constants are unsigned long long and currently reach bit 39.
//      A plain long is only guaranteed to be 32 bits wide, so on such
//      platforms it could not represent the flags from CRM_READLINE
//      (bit 32) upward.
//

typedef struct
{
  char *stmt_name;           // text representation of the command
  int stmt_code;             // internal dispatch code (a CRM_* statement type)
  int namelen;               // presumably the length of stmt_name - TODO confirm
  int is_executable;         // is the statement "executable" or not
  int minslashes;            // minimum number of slash-groups
  int maxslashes;            // maximum number of slash-groups
  int minparens;             // minimum number of paren-groups
  int maxparens;             // maximum number of paren-groups
  int minboxes;              // minimum number of box-groups
  int maxboxes;              // maximum number of box-groups
  unsigned long long flags_allowed_mask;   // OR of the CRM_* flags allowed
} STMT_TABLE_TYPE;
379 
380 
381 //     The compiler file actually contains this "for real", the
382 //      extern here is merely a reference to it.
383 //
384 #ifndef BASE_COMPILER_TABLE_HERE
385 extern STMT_TABLE_TYPE stmt_table[];
386 #endif
387 
388 
//   Argument type codes.  Note that they must remain synched
//   IN THIS ORDER with the start chars and end chars in
//   crm_statement_parse.
//
#define CRM_ANGLES   0
#define CRM_PARENS   1
#define CRM_BOXES    2
#define CRM_SLASHES  3
396 
397 
398 
//   The possible exit codes
#define CRM_EXIT_OK            0
#define CRM_EXIT_ERROR         1
#define CRM_EXIT_FATAL         2
#define CRM_EXIT_APOCALYPSE  666
404 
405 
//   The exec codes for crm_zexpandvar.  Each is a separate bit; OR
//   together the ones you want zexpandvar to enable.  Nexpandvar is
//   ansi|stringvar|redirect, and qexpandvar is "all of them".  :)
#define CRM_EVAL_ANSI        0x01
#define CRM_EVAL_STRINGVAR   0x02
#define CRM_EVAL_REDIRECT    0x04
#define CRM_EVAL_STRINGLEN   0x08
#define CRM_EVAL_MATH        0x10
414 
415 
416 
//    The possible cache actions:
//      UNUSED
//      ACTIVE  - makes it really mapped (or reactivates a released mmap)
//      RELEASE - marks the slot reusable, but doesn't unmap (yet)
//      DROP    - really unmaps
#define CRM_MMAP_CACHE_UNUSED   0
#define CRM_MMAP_CACHE_ACTIVE   1
#define CRM_MMAP_CACHE_RELEASE  2
#define CRM_MMAP_CACHE_DROP     3
425 
426 
427 #endif	// !__CRM114_STRUCTS_H__
428