1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright (c) 2014 Linas Vepstas                                      */
5 /* All rights reserved                                                   */
6 /*                                                                       */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software.    */
9 /* This license allows free redistribution and use in source and binary  */
10 /* forms, with or without modification, subject to certain conditions.   */
11 /*                                                                       */
12 /*************************************************************************/
13 
14 /* This file is somewhat misnamed, as everything here defines the
15  * link-private, internal-use-only "api", which is subject to change
16  * from revision to revision. No external code should link to this
17  * stuff.
18  */
19 
20 /*****************************************************************************
21 *
22 * NOTE: There are five basic "types" used within the link parser.  These are:
23 *
24 *       Dictionary, Parse_Options, Sentence, Linkage, PostProcessor
25 *
26 * To make the use of the API simpler, each of these is typedef'ed as a pointer
27 * to a data structure.  As a result, some of the code may look a little funny,
28 * since it uses pointers in a way that is syntactically inconsistent.  After
29 * working a bit with these basic types enough, this should not be confusing.
30 *
31 ******************************************************************************/
32 
33 #ifndef _API_STRUCTURESH_
34 #define _API_STRUCTURESH_
35 
36 #include <stdint.h>
37 
38 /* For locale_t */
39 #ifdef HAVE_LOCALE_T_IN_LOCALE_H
40 #include <locale.h>
41 #endif /* HAVE_LOCALE_T_IN_LOCALE_H */
42 #ifdef HAVE_LOCALE_T_IN_XLOCALE_H
43 #include <xlocale.h>
44 #endif /* HAVE_LOCALE_T_IN_XLOCALE_H */
45 
46 #include "api-types.h"
47 #include "dict-common/dialect.h"
48 #include "tracon-set.h"
49 #include "memory-pool.h"
50 #include "string-set.h"
51 
52 /* Performance tuning.
53  * For short sentences, encoding takes more resources than it saves. If
54  * this overhead is improved, this limit can be set lower. */
55 #define SENTENCE_MIN_LENGTH_TRAILING_HASH 6
56 
57 /* Pruning per null-count is costly for sentences whose parsing time
58  * is relatively small. If a better pruning per null-count is implemented,
59  * this limit can be set lower. */
60 #define SENTENCE_MIN_LENGTH_MULTI_PRUNING 30
61 
62 typedef struct Cost_Model_s Cost_Model;
63 struct Cost_Model_s
64 {
65 	Cost_Model_type type;
66 	int (*compare_fn)(Linkage, Linkage);
67 };
68 
69 struct Resources_s
70 {
71 	int    max_parse_time;  /* in seconds */
72 	size_t max_memory;      /* in bytes */
73 	double time_when_parse_started;
74 	size_t space_when_parse_started;
75 	double when_created;
76 	double when_last_called;
77 	double cumulative_time;
78 	bool   memory_exhausted;
79 	bool   timer_expired;
80 };
81 
82 struct Parse_Options_s
83 {
84 	/* General options */
85 	short verbosity;       /* Level of detail to give about the computation 0 */
86 	char * debug;          /* Comma-separated function names to debug "" */
87 	char * test;           /* Comma-separated features to test "" */
88 	Resources resources;   /* For deciding when to abort the parsing */
89 
90 	/* Options governing the tokenizer (sentence-splitter) */
91 	short use_spell_guess; /* Up to this many spell-guesses per unknown word 7 */
92 
93 	/* Choice of the parser to use */
94 	bool use_sat_solver;   /* Use the Boolean SAT based parser */
95 
96 	/* Options governing the parser internals operation */
97 	double disjunct_cost;  /* Max disjunct cost to allow */
98 	short min_null_count;  /* The minimum number of null links to allow */
99 	short max_null_count;  /* The maximum number of null links to allow */
100 	bool islands_ok;       /* If TRUE, then linkages with islands
101 	                          (separate component of the link graph)
102 	                          will be generated (default=FALSE) */
103 	size_t short_length;   /* Links that are limited in length can be
104 	                          no longer than this.  Default = 16 */
105 	bool all_short;        /* If true, there can be no connectors that are exempt */
106 	bool repeatable_rand;  /* Reset rand number gen after every parse. */
107 
108 	/* Options governing post-processing */
109 	bool perform_pp_prune; /* Perform post-processing-based pruning TRUE */
110 	size_t twopass_length; /* Min sent length for two-pass post processing */
111 	Cost_Model cost_model; /* For sorting linkages after parsing. */
112 
113 	/* Options governing the generation of linkages. */
114 	size_t linkage_limit;  /* The maximum number of linkages processed 100 */
115 	bool display_morphology;/* If true, print morpho analysis of words TRUE */
116 
117 	/* Options governing the dictionary interpretation. */
118 	dialect_info dialect;
119 };
120 
121 typedef struct word_queue_s word_queue_t;
122 struct word_queue_s
123 {
124 	Gword *word;
125 	word_queue_t *next;
126 };
127 
128 struct Sentence_s
129 {
130 	Dictionary  dict;           /* Words are defined from this dictionary */
131 	const char *orig_sentence;  /* Copy of original sentence */
132 	size_t length;              /* Number of words */
133 	Word  *word;                /* Array of words after tokenization */
134 	String_set *   string_set;  /* Used for assorted strings */
135 	Pool_desc * fm_Match_node;  /* Fast-matcher Match_node memory pool */
136 	Pool_desc * Table_connector_pool; /* Count memoizing memory pool */
137 	Pool_desc * Exp_pool;
138 	Pool_desc * X_node_pool;
139 	Pool_desc * Disjunct_pool;
140 	Pool_desc * Connector_pool;
141 
142 	/* Connector encoding, packing & sharing. */
143 	size_t min_len_encoding;     /* Encode from this sentence length. */
144 	void *dc_memblock;           /* For packed disjuncts & connectors. */
145 	unsigned int num_disjuncts;  /* Number of disjuncts in dc_memblock. */
146 
147 	/* Wordgraph stuff. FIXME: create stand-alone struct for these. */
148 	Gword *wordgraph;            /* Tokenization wordgraph */
149 	Gword *last_word;            /* FIXME Last issued word */
150 	word_queue_t *word_queue;    /* Element in queue of words to tokenize */
151 	word_queue_t *word_queue_last;
152 	size_t gword_node_num;       /* Debug - for differentiating between
153 	                                wordgraph nodes with identical subwords. */
154 
155 	size_t min_len_multi_pruning; /* Do it from this sentence length. */
156 
157 	/* Parse results */
158 	int    num_linkages_found;  /* Total number before postprocessing.  This
159 	                               is returned by the do_count() function */
160 	size_t num_linkages_alloced;/* Total number of linkages allocated.
161 	                               the number post-processed might be fewer
162 	                               because some are non-canonical */
163 	size_t num_linkages_post_processed;
164 	                            /* The number of linkages that are actually
165 	                               put into the array that was alloced.
166 	                               This is not the same as num alloced
167 	                               because some may be non-canonical. */
168 	size_t num_valid_linkages;  /* Number with no pp violations */
169 	unsigned int null_count;    /* Number of null links in linkages */
170 	Linkage        lnkages;     /* Sorted array of valid & invalid linkages */
171 	Postprocessor * postprocessor;
172 	Postprocessor * constituent_pp;
173 
174 	/* thread-safe random number state */
175 	unsigned int rand_state;
176 
177 #ifdef USE_SAT_SOLVER
178 	void *hook;                 /* Hook for the SAT solver */
179 #endif /* USE_SAT_SOLVER */
180 };
181 
182 #endif
183