1 /*************************************************************************/ 2 /* Copyright (c) 2004 */ 3 /* Daniel Sleator, David Temperley, and John Lafferty */ 4 /* Copyright (c) 2014 Linas Vepstas */ 5 /* All rights reserved */ 6 /* */ 7 /* Use of the link grammar parsing system is subject to the terms of the */ 8 /* license set forth in the LICENSE file included with this software. */ 9 /* This license allows free redistribution and use in source and binary */ 10 /* forms, with or without modification, subject to certain conditions. */ 11 /* */ 12 /*************************************************************************/ 13 14 /* This file is somewhat misnamed, as everything here defines the 15 * link-private, internal-use-only "api", which is subject to change 16 * from revision to revision. No external code should link to this 17 * stuff. 18 */ 19 20 /***************************************************************************** 21 * 22 * NOTE: There are five basic "types" used within the link parser. These are: 23 * 24 * Dictionary, Parse_Options, Sentence, Linkage, PostProcessor 25 * 26 * To make the use of the API simpler, each of these is typedef'ed as a pointer 27 * to a data structure. As a result, some of the code may look a little funny, 28 * since it uses pointers in a way that is syntactically inconsistent. After 29 * working a bit with these basic types enough, this should not be confusing. 30 * 31 ******************************************************************************/ 32 33 #ifndef _API_STRUCTURESH_ 34 #define _API_STRUCTURESH_ 35 36 #include <stdint.h> 37 38 /* For locale_t */ 39 #ifdef HAVE_LOCALE_T_IN_LOCALE_H 40 #include <locale.h> 41 #endif /* HAVE_LOCALE_T_IN_LOCALE_H */ 42 #ifdef HAVE_LOCALE_T_IN_XLOCALE_H 43 #include <xlocale.h> 44 #endif /* HAVE_LOCALE_T_IN_XLOCALE_H */ 45 46 #include "api-types.h" 47 #include "dict-common/dialect.h" 48 #include "tracon-set.h" 49 #include "memory-pool.h" 50 #include "string-set.h" 51 52 /* Performance tuning. 53 * For short sentences, encoding takes more resources than it saves. If 54 * this overhead is improved, this limit can be set lower. */ 55 #define SENTENCE_MIN_LENGTH_TRAILING_HASH 6 56 57 /* Pruning per null-count is costly for sentences whose parsing time 58 * is relatively small. If a better pruning per null-count is implemented, 59 * this limit can be set lower. */ 60 #define SENTENCE_MIN_LENGTH_MULTI_PRUNING 30 61 62 typedef struct Cost_Model_s Cost_Model; 63 struct Cost_Model_s 64 { 65 Cost_Model_type type; 66 int (*compare_fn)(Linkage, Linkage); 67 }; 68 69 struct Resources_s 70 { 71 int max_parse_time; /* in seconds */ 72 size_t max_memory; /* in bytes */ 73 double time_when_parse_started; 74 size_t space_when_parse_started; 75 double when_created; 76 double when_last_called; 77 double cumulative_time; 78 bool memory_exhausted; 79 bool timer_expired; 80 }; 81 82 struct Parse_Options_s 83 { 84 /* General options */ 85 short verbosity; /* Level of detail to give about the computation 0 */ 86 char * debug; /* Comma-separated function names to debug "" */ 87 char * test; /* Comma-separated features to test "" */ 88 Resources resources; /* For deciding when to abort the parsing */ 89 90 /* Options governing the tokenizer (sentence-splitter) */ 91 short use_spell_guess; /* Up to this many spell-guesses per unknown word 7 */ 92 93 /* Choice of the parser to use */ 94 bool use_sat_solver; /* Use the Boolean SAT based parser */ 95 96 /* Options governing the parser internals operation */ 97 double disjunct_cost; /* Max disjunct cost to allow */ 98 short min_null_count; /* The minimum number of null links to allow */ 99 short max_null_count; /* The maximum number of null links to allow */ 100 bool islands_ok; /* If TRUE, then linkages with islands 101 (separate component of the link graph) 102 will be generated (default=FALSE) */ 103 size_t short_length; /* Links that are limited in length can be 104 no longer than this. Default = 16 */ 105 bool all_short; /* If true, there can be no connectors that are exempt */ 106 bool repeatable_rand; /* Reset rand number gen after every parse. */ 107 108 /* Options governing post-processing */ 109 bool perform_pp_prune; /* Perform post-processing-based pruning TRUE */ 110 size_t twopass_length; /* Min sent length for two-pass post processing */ 111 Cost_Model cost_model; /* For sorting linkages after parsing. */ 112 113 /* Options governing the generation of linkages. */ 114 size_t linkage_limit; /* The maximum number of linkages processed 100 */ 115 bool display_morphology;/* If true, print morpho analysis of words TRUE */ 116 117 /* Options governing the dictionary interpretation. */ 118 dialect_info dialect; 119 }; 120 121 typedef struct word_queue_s word_queue_t; 122 struct word_queue_s 123 { 124 Gword *word; 125 word_queue_t *next; 126 }; 127 128 struct Sentence_s 129 { 130 Dictionary dict; /* Words are defined from this dictionary */ 131 const char *orig_sentence; /* Copy of original sentence */ 132 size_t length; /* Number of words */ 133 Word *word; /* Array of words after tokenization */ 134 String_set * string_set; /* Used for assorted strings */ 135 Pool_desc * fm_Match_node; /* Fast-matcher Match_node memory pool */ 136 Pool_desc * Table_connector_pool; /* Count memoizing memory pool */ 137 Pool_desc * Exp_pool; 138 Pool_desc * X_node_pool; 139 Pool_desc * Disjunct_pool; 140 Pool_desc * Connector_pool; 141 142 /* Connector encoding, packing & sharing. */ 143 size_t min_len_encoding; /* Encode from this sentence length. */ 144 void *dc_memblock; /* For packed disjuncts & connectors. */ 145 unsigned int num_disjuncts; /* Number of disjuncts in dc_memblock. */ 146 147 /* Wordgraph stuff. FIXME: create stand-alone struct for these. */ 148 Gword *wordgraph; /* Tokenization wordgraph */ 149 Gword *last_word; /* FIXME Last issued word */ 150 word_queue_t *word_queue; /* Element in queue of words to tokenize */ 151 word_queue_t *word_queue_last; 152 size_t gword_node_num; /* Debug - for differentiating between 153 wordgraph nodes with identical subwords. */ 154 155 size_t min_len_multi_pruning; /* Do it from this sentence length. */ 156 157 /* Parse results */ 158 int num_linkages_found; /* Total number before postprocessing. This 159 is returned by the do_count() function */ 160 size_t num_linkages_alloced;/* Total number of linkages allocated. 161 the number post-processed might be fewer 162 because some are non-canonical */ 163 size_t num_linkages_post_processed; 164 /* The number of linkages that are actually 165 put into the array that was alloced. 166 This is not the same as num alloced 167 because some may be non-canonical. */ 168 size_t num_valid_linkages; /* Number with no pp violations */ 169 unsigned int null_count; /* Number of null links in linkages */ 170 Linkage lnkages; /* Sorted array of valid & invalid linkages */ 171 Postprocessor * postprocessor; 172 Postprocessor * constituent_pp; 173 174 /* thread-safe random number state */ 175 unsigned int rand_state; 176 177 #ifdef USE_SAT_SOLVER 178 void *hook; /* Hook for the SAT solver */ 179 #endif /* USE_SAT_SOLVER */ 180 }; 181 182 #endif 183