1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright (c) 2013 Linas Vepstas                                      */
5 /* Copyright (c) 2014 Amir Plivatsky                                     */
6 /* All rights reserved                                                   */
7 /*                                                                       */
8 /* Use of the link grammar parsing system is subject to the terms of the */
9 /* license set forth in the LICENSE file included with this software.    */
10 /* This license allows free redistribution and use in source and binary  */
11 /* forms, with or without modification, subject to certain conditions.   */
12 /*                                                                       */
13 /*************************************************************************/
14 
15 #ifndef _TOK_STRUCTURES_H_
16 #define _TOK_STRUCTURES_H_
17 
18 #include <stddef.h>
19 #include "api-types.h"
20 #include "link-includes.h"
21 
22 // TODO provide gword access methods!
23 
24 /* An ordered set of gword pointers, used to indicate the source gword
25  * (Wordgraph word) of disjuncts and connectors. Usually it contains only
26  * one element.  However, when a duplicate disjunct is eliminated (see
27  * eliminate_duplicate_disjuncts()) and it originated from a different
28  * gword (a relatively rare event) its gword is added to the gword_set of
29  * the remaining disjunct. A set of 3 elements is extremely rare. The
30  * original order is preserved, in a hope for better caching on
31  * alternatives match checks in fast-match.c.
32  *
33  * Memory management:
34  * A copy-on-write semantics is used when constructing a new gword_set.  It
35  * means that all the gword sets with one element are shared.  These gword
36  * sets are part of the Gword structure. Copied and added element are
37  * alloc'ed and chained. The result is that the chain_next of the gword
38  * sets that are part of each gword contains the list of alloc'ed elements -
39  * to be used in gword_set_delete() called *only* in sentence_delete().
40  * This ensures that the gword_set of connectors doesn't get stale when
41  * their disjuncts are deleted and later restored in one-step parse when
42  * min_null_count=0 and max_null count>0 (see classic_parse()).
43  */
44 struct gword_set
45 {
46 	Gword *o_gword;
47 	struct gword_set *next;
48 	struct gword_set *chain_next;
49 };
50 
51 
52 typedef enum
53 {
54 	MT_INVALID,            /* Zero, to be changed to the correct type */
55 	MT_WORD,               /* Regular word */
56 	MT_FEATURE,            /* Pseudo morpheme, currently capitalization marks */
57 	MT_INFRASTRUCTURE,     /* Start and end Wordgraph pseudo-words */
58 	MT_WALL,               /* The LEFT-WALL and RIGHT-WALL pseudo-words */
59 	MT_EMPTY,              /* Empty word FIXME: Remove it. */
60 	MT_UNKNOWN,            /* Unknown word (FIXME? Unused) */
61 	/* Experimental for Semitic languages (yet unused) */
62 	MT_TEMPLATE,
63 	MT_ROOT,
64 	/* Experimental - for display purposes.
65 	 * MT_CONTR is now used in the tokenization step, see the comments there. */
66 	MT_CONTR,              /* Contracted part of a contraction (e.g. y', 's) */
67 	MT_PUNC,               /* Punctuation */
68 	/* We are not going to have >63 types up to here. */
69 	MT_STEM    = 1<<6,     /* Stem */
70 	MT_PREFIX  = 1<<7,     /* Prefix */
71 	MT_MIDDLE  = 1<<8,     /* Middle morpheme (yet unused) */
72 	MT_SUFFIX  = 1<<9      /* Suffix */
73 } Morpheme_type;
74 #define IS_REG_MORPHEME (MT_STEM|MT_PREFIX|MT_MIDDLE|MT_SUFFIX)
75 
76 /* Word status */
77 /* - Tokenization */
78 #define WS_UNKNOWN (1<<0) /* Unknown word */
79 #define WS_REGEX   (1<<1) /* Matches a regex */
80 #define WS_SPELL   (1<<2) /* Result of a spell guess */
81 #define WS_RUNON   (1<<3) /* Separated from words run-on */
82 #define WS_HASALT  (1<<4) /* Has alternatives (one or more)*/
83 #define WS_UNSPLIT (1<<5) /* It's an alternative to itself as an unsplit word */
84 #define WS_INDICT  (1<<6) /* dict_has_word() is true */
85 #define WS_FIRSTUPPER (1<<7) /* Subword is the lc version of its unsplit_word.
86                                 The original word can be restored if needed
87                                 through this unsplit_word. */
88 /* - Post linkage stage. */
89 #define WS_PL      (1<<14) /* Post-Linkage, not belonging to tokenization */
90 
91 #define WS_GUESS (WS_SPELL|WS_RUNON|WS_REGEX)
92 
93 /*Only TS_DONE is actually used. */
94 typedef enum
95 {
96 	TS_INITIAL,
97 	TS_LR_STRIP,
98 	TS_AFFIX_SPLIT,
99 	TS_REGEX,
100 	TS_RUNON,
101 	TS_SPELL,
102 	TS_DONE                  /* Tokenization done */
103 } Tokenizing_step;
104 
105 /* For the "guess" field of Gword_struct. */
106 typedef enum
107 {
108 	GM_REGEX = '!',
109 	GM_SPELL = '~',
110 	GM_RUNON = '&',
111 	GM_UNKNOWN = '?'
112 } Guess_mark;
113 
114 #define MAX_SPLITS 10   /* See split_counter below */
115 
116 struct Gword_struct
117 {
118 	const char *subword;
119 	const char *start;   /* subword start position. */
120 	const char *end;     /* subword end position. */
121 
122 	Gword *unsplit_word; /* Upward-going co-tree */
123 	Gword **next;        /* Right-going tree */
124 	Gword **prev;        /* Left-going tree */
125 	Gword *chain_next;   /* Next word in the chain of all words */
126 
127 	/* Disjuncts and connectors point back to their originating Gword(s). */
128 	gword_set gword_set_head;
129 
130 	/* Used by sane_linkage_morphism() and remove_empty_words() for
131 	 * locating optional words that should be removed. */
132 	WordIdx sent_wordidx; /* Index in the 2D sentence word array. */
133 
134 	/* For debug and inspiration. */
135 	const char *label;   /* Debug label - code locations of tokenization */
136 	size_t node_num;     /* A sequential number, assigned in the order in
137 	                        which word splits are done. Shown in the
138 	                        Wordgraph display and in debug messages for
139 	                        easier differentiating words with identical
140 	                        subwords, and for indicating their split order.
141 	                        Also used in set_connector_hash().  Could have
142 	                        been used for hier_position instead of pointers
143 	                        in order to optimize its generation and
144 	                        comparison. */
145 
146 	/* Tokenizer state */
147 	Tokenizing_step tokenizing_step;
148 	bool issued_unsplit; /* The word has been issued as an alternative to itself.
149 	                        It will become an actual alternative to itself only
150 	                        if it's not the sole alternative, in which case it
151 	                        will be marked with WS_UNSPLIT. */
152 	size_t split_counter; /* Incremented on splits. A word cannot split more than
153 	                         MAX_SPLITS times and a warning is issued then. */
154 
155 	unsigned int status;         /* See WS_* */
156 	Morpheme_type morpheme_type; /* See MT_* */
157 	Gword *alternative_id;       /* Alternative start - a unique identifier of
158 	                                the alternative to which the word belongs. */
159 	const char *regex_name;      /* Subword matches this regex.
160                                    FIXME? Extend for multiple regexes. */
161 
162 	/* Only used by wordgraph_flatten() */
163 	const Gword **hier_position; /* Unsplit_word/alternative_id pointer list, up
164                                    to the original sentence word. */
165 	size_t hier_depth;           /* Number of pointer pairs in hier_position */
166 
167 	/* XXX Experimental. Only used after the linkage (by compute_chosen_words())
168 	 * for an element in the linkage display wordgraph path that represents
169 	 * a block of null words that are morphemes of the same word. */
170 	Gword **null_subwords;       /* Null subwords represented by this word */
171 };
172 
173 /* Wordgraph path word-positions,
174  * used in wordgraph_flatten() and sane_linkage_morphism().
175  * FIXME Separate to two different structures. */
176 struct Wordgraph_pathpos_s
177 {
178 	Gword *word;      /* Position in the Wordgraph */
179 	/* Only for wordgraph_flatten(). */
180 	bool same_word;   /* Still the same word - mark it as "optional" */
181 	bool next_ok;     /* OK to proceed to the next Wordgraph word */
182 	bool used;        /* Debug - the word has been issued */
183 	/* Only for sane_morphism(). */
184 	const Gword **path; /* Linkage candidate wordgraph path */
185 };
186 #endif
187