1 /*************************************************************************/ 2 /* Copyright (c) 2004 */ 3 /* Daniel Sleator, David Temperley, and John Lafferty */ 4 /* Copyright (c) 2013 Linas Vepstas */ 5 /* Copyright (c) 2014 Amir Plivatsky */ 6 /* All rights reserved */ 7 /* */ 8 /* Use of the link grammar parsing system is subject to the terms of the */ 9 /* license set forth in the LICENSE file included with this software. */ 10 /* This license allows free redistribution and use in source and binary */ 11 /* forms, with or without modification, subject to certain conditions. */ 12 /* */ 13 /*************************************************************************/ 14 15 #ifndef _TOK_STRUCTURES_H_ 16 #define _TOK_STRUCTURES_H_ 17 18 #include <stddef.h> 19 #include "api-types.h" 20 #include "link-includes.h" 21 22 // TODO provide gword access methods! 23 24 /* An ordered set of gword pointers, used to indicate the source gword 25 * (Wordgraph word) of disjuncts and connectors. Usually it contains only 26 * one element. However, when a duplicate disjunct is eliminated (see 27 * eliminate_duplicate_disjuncts()) and it originated from a different 28 * gword (a relatively rare event) its gword is added to the gword_set of 29 * the remaining disjunct. A set of 3 elements is extremely rare. The 30 * original order is preserved, in a hope for better caching on 31 * alternatives match checks in fast-match.c. 32 * 33 * Memory management: 34 * A copy-on-write semantics is used when constructing a new gword_set. It 35 * means that all the gword sets with one element are shared. These gword 36 * sets are part of the Gword structure. Copied and added element are 37 * alloc'ed and chained. The result is that the chain_next of the gword 38 * sets that are part of each gword contains the list of alloc'ed elements - 39 * to be used in gword_set_delete() called *only* in sentence_delete(). 40 * This ensures that the gword_set of connectors doesn't get stale when 41 * their disjuncts are deleted and later restored in one-step parse when 42 * min_null_count=0 and max_null count>0 (see classic_parse()). 43 */ 44 struct gword_set 45 { 46 Gword *o_gword; 47 struct gword_set *next; 48 struct gword_set *chain_next; 49 }; 50 51 52 typedef enum 53 { 54 MT_INVALID, /* Zero, to be changed to the correct type */ 55 MT_WORD, /* Regular word */ 56 MT_FEATURE, /* Pseudo morpheme, currently capitalization marks */ 57 MT_INFRASTRUCTURE, /* Start and end Wordgraph pseudo-words */ 58 MT_WALL, /* The LEFT-WALL and RIGHT-WALL pseudo-words */ 59 MT_EMPTY, /* Empty word FIXME: Remove it. */ 60 MT_UNKNOWN, /* Unknown word (FIXME? Unused) */ 61 /* Experimental for Semitic languages (yet unused) */ 62 MT_TEMPLATE, 63 MT_ROOT, 64 /* Experimental - for display purposes. 65 * MT_CONTR is now used in the tokenization step, see the comments there. */ 66 MT_CONTR, /* Contracted part of a contraction (e.g. y', 's) */ 67 MT_PUNC, /* Punctuation */ 68 /* We are not going to have >63 types up to here. */ 69 MT_STEM = 1<<6, /* Stem */ 70 MT_PREFIX = 1<<7, /* Prefix */ 71 MT_MIDDLE = 1<<8, /* Middle morpheme (yet unused) */ 72 MT_SUFFIX = 1<<9 /* Suffix */ 73 } Morpheme_type; 74 #define IS_REG_MORPHEME (MT_STEM|MT_PREFIX|MT_MIDDLE|MT_SUFFIX) 75 76 /* Word status */ 77 /* - Tokenization */ 78 #define WS_UNKNOWN (1<<0) /* Unknown word */ 79 #define WS_REGEX (1<<1) /* Matches a regex */ 80 #define WS_SPELL (1<<2) /* Result of a spell guess */ 81 #define WS_RUNON (1<<3) /* Separated from words run-on */ 82 #define WS_HASALT (1<<4) /* Has alternatives (one or more)*/ 83 #define WS_UNSPLIT (1<<5) /* It's an alternative to itself as an unsplit word */ 84 #define WS_INDICT (1<<6) /* dict_has_word() is true */ 85 #define WS_FIRSTUPPER (1<<7) /* Subword is the lc version of its unsplit_word. 86 The original word can be restored if needed 87 through this unsplit_word. */ 88 /* - Post linkage stage. */ 89 #define WS_PL (1<<14) /* Post-Linkage, not belonging to tokenization */ 90 91 #define WS_GUESS (WS_SPELL|WS_RUNON|WS_REGEX) 92 93 /*Only TS_DONE is actually used. */ 94 typedef enum 95 { 96 TS_INITIAL, 97 TS_LR_STRIP, 98 TS_AFFIX_SPLIT, 99 TS_REGEX, 100 TS_RUNON, 101 TS_SPELL, 102 TS_DONE /* Tokenization done */ 103 } Tokenizing_step; 104 105 /* For the "guess" field of Gword_struct. */ 106 typedef enum 107 { 108 GM_REGEX = '!', 109 GM_SPELL = '~', 110 GM_RUNON = '&', 111 GM_UNKNOWN = '?' 112 } Guess_mark; 113 114 #define MAX_SPLITS 10 /* See split_counter below */ 115 116 struct Gword_struct 117 { 118 const char *subword; 119 const char *start; /* subword start position. */ 120 const char *end; /* subword end position. */ 121 122 Gword *unsplit_word; /* Upward-going co-tree */ 123 Gword **next; /* Right-going tree */ 124 Gword **prev; /* Left-going tree */ 125 Gword *chain_next; /* Next word in the chain of all words */ 126 127 /* Disjuncts and connectors point back to their originating Gword(s). */ 128 gword_set gword_set_head; 129 130 /* Used by sane_linkage_morphism() and remove_empty_words() for 131 * locating optional words that should be removed. */ 132 WordIdx sent_wordidx; /* Index in the 2D sentence word array. */ 133 134 /* For debug and inspiration. */ 135 const char *label; /* Debug label - code locations of tokenization */ 136 size_t node_num; /* A sequential number, assigned in the order in 137 which word splits are done. Shown in the 138 Wordgraph display and in debug messages for 139 easier differentiating words with identical 140 subwords, and for indicating their split order. 141 Also used in set_connector_hash(). Could have 142 been used for hier_position instead of pointers 143 in order to optimize its generation and 144 comparison. */ 145 146 /* Tokenizer state */ 147 Tokenizing_step tokenizing_step; 148 bool issued_unsplit; /* The word has been issued as an alternative to itself. 149 It will become an actual alternative to itself only 150 if it's not the sole alternative, in which case it 151 will be marked with WS_UNSPLIT. */ 152 size_t split_counter; /* Incremented on splits. A word cannot split more than 153 MAX_SPLITS times and a warning is issued then. */ 154 155 unsigned int status; /* See WS_* */ 156 Morpheme_type morpheme_type; /* See MT_* */ 157 Gword *alternative_id; /* Alternative start - a unique identifier of 158 the alternative to which the word belongs. */ 159 const char *regex_name; /* Subword matches this regex. 160 FIXME? Extend for multiple regexes. */ 161 162 /* Only used by wordgraph_flatten() */ 163 const Gword **hier_position; /* Unsplit_word/alternative_id pointer list, up 164 to the original sentence word. */ 165 size_t hier_depth; /* Number of pointer pairs in hier_position */ 166 167 /* XXX Experimental. Only used after the linkage (by compute_chosen_words()) 168 * for an element in the linkage display wordgraph path that represents 169 * a block of null words that are morphemes of the same word. */ 170 Gword **null_subwords; /* Null subwords represented by this word */ 171 }; 172 173 /* Wordgraph path word-positions, 174 * used in wordgraph_flatten() and sane_linkage_morphism(). 175 * FIXME Separate to two different structures. */ 176 struct Wordgraph_pathpos_s 177 { 178 Gword *word; /* Position in the Wordgraph */ 179 /* Only for wordgraph_flatten(). */ 180 bool same_word; /* Still the same word - mark it as "optional" */ 181 bool next_ok; /* OK to proceed to the next Wordgraph word */ 182 bool used; /* Debug - the word has been issued */ 183 /* Only for sane_morphism(). */ 184 const Gword **path; /* Linkage candidate wordgraph path */ 185 }; 186 #endif 187