1 // Copyright (C) 2012  Davis E. King (davis@dlib.net)
2 // License: Boost Software License   See LICENSE.txt for the full license.
3 #undef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
4 #ifdef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
5 
6 #include <vector>
7 #include <string>
8 #include "../algs.h"
9 #include "../serialize.h"
10 
11 namespace dlib
12 {
13 
14 // -----------------------------------------------------------------------------------------
15 
16     template <
17         typename T
18         >
19     struct constituent
20     {
21         /*!
22             WHAT THIS OBJECT REPRESENTS
23                 This object represents the linguistic idea of a constituent, that is, a
24                 group of words that functions as a single unit.  In particular, it
25                 represents a combination of two constituents into a new constituent.
26 
27                 Additionally, a constituent object represents a range of words relative to
28                 some std::vector of words.  The range is from [begin, end) (i.e. including
29                 begin but not including end, so using the normal C++ iterator notation).
30                 Moreover, a constituent is always composed of two parts, each having a tag.
31                 Therefore, the left part is composed of the words in the range [begin,k)
32                 and has tag left_tag while the right part of the constituent contains the
33                 words in the range [k,end) and has the tag right_tag.
34 
35                 The tags are user defined objects of type T.  In general, they are used to
36                 represent syntactic categories such as noun phrase, verb phrase, etc.
37         !*/
38 
39         unsigned long begin, end, k;
40         T left_tag;
41         T right_tag;
42     };
43 
44     template <
45         typename T
46         >
47     void serialize(
48         const constituent<T>& item,
49         std::ostream& out
50     );
51     /*!
52         provides serialization support
53     !*/
54 
55     template <
56         typename T
57         >
58     void deserialize(
59         constituent<T>& item,
60         std::istream& in
61     );
62     /*!
63         provides deserialization support
64     !*/
65 
66 // -----------------------------------------------------------------------------------------
67 
68     /*!A END_OF_TREE is used to indicate that parse_tree_element::left or
69          parse_tree_element::right doesn't point to another subtree.
70     !*/
71     const unsigned long END_OF_TREE = 0xFFFFFFFF;
72 
73 // -----------------------------------------------------------------------------------------
74 
75     template <
76         typename T
77         >
78     struct parse_tree_element
79     {
80         /*!
81             WHAT THIS OBJECT REPRESENTS
82                 This object is used to represent a node in a binary parse tree.  An entire
83                 parse tree is represented by a std::vector of parse_tree_element objects.
84                 We follow the convention that the first element of this vector is always
85                 the root of the entire tree.
86 
87                 The fields of this object have the following interpretations:
88                     - c == the constituent spanned by this node in the parse tree.
89                       Therefore, the node spans the words in the range [c.begin, c.end).
90                     - tag == the syntactic category of this node in the parse tree.
91                     - score == the score or log likelihood for this parse tree.  In
92                       general, this is the sum of scores of all the production rules used
93                       to build the tree rooted at the current node.
94                     - let PT denote the vector of parse_tree_elements that defines an
95                       entire parse tree.  Then we have:
96                         - if (left != END_OF_TREE) then
97                             - PT[left] == the left sub-tree of the current node.
98                             - PT[left] spans the words [c.begin, c.k)
99                             - PT[left].tag == c.left_tag
100                         - else
101                             - there is no left sub-tree
102 
103                         - if (right != END_OF_TREE) then
104                             - PT[right] == the right sub-tree of the current node.
105                             - PT[right] spans the words [c.k, c.end)
106                             - PT[right].tag == c.right_tag
107                         - else
108                             - there is no right sub-tree
109         !*/
110 
111         constituent<T> c;
112         T tag;
113         double score;
114 
115         unsigned long left;
116         unsigned long right;
117     };
118 
119     template <
120         typename T
121         >
122     void serialize (
123         const parse_tree_element<T>& item,
124         std::ostream& out
125     );
126     /*!
127         provides serialization support
128     !*/
129 
130     template <
131         typename T
132         >
133     void deserialize (
134         parse_tree_element<T>& item,
135         std::istream& in
136     );
137     /*!
138         provides deserialization support
139     !*/
140 
141 // -----------------------------------------------------------------------------------------
142 // -----------------------------------------------------------------------------------------
143 
144     void example_production_rule_function (
145         const std::vector<T>& words,
146         const constituent<T>& c,
147         std::vector<std::pair<T,double> >& possible_tags
148     )
149     /*!
150         requires
151             - 0 <= c.begin < c.k < c.end <= words.size()
152             - possible_tags.size() == 0
153         ensures
154             - Finds all the syntactic categories that can be used to label c and puts those
155               categories, along with their scores, into possible_tags.  Or in other words,
156               this function determines which production rules can be used to turn the left
157               and right sub-constituents in c into a single constituent.  The contents of c
158               have the following interpretations:
159                 - The left sub-constituent has syntactic category c.left_tag
160                 - for all i such that c.begin <= i < c.k:
161                     - words[i] is part of the left sub-constituent.
162                 - The right sub-constituent has syntactic category c.right_tag
163                 - for all i such that c.k <= i < c.end:
164                     - words[i] is part of the right sub-constituent.
165 
166             - Note that example_production_rule_function() is not a real function.  It is
167               here just to show you how to define production rule producing functions for
168               use with the find_max_parse_cky() routine defined below.
169     !*/
170 
171     template <
172         typename T,
173         typename production_rule_function
174         >
175     void find_max_parse_cky (
176         const std::vector<T>& words,
177         const production_rule_function& production_rules,
178         std::vector<parse_tree_element<T> >& parse_tree
179     );
180     /*!
181         requires
182             - production_rule_function == a function or function object with the same
183               interface as example_production_rule_function defined above.
184             - It must be possible to store T objects in a std::map.
185         ensures
186             - Uses the CKY algorithm to find the most probable/highest scoring binary parse
187               tree of the given vector of words.
188             - if (#parse_tree.size() == 0) then
189                 - There is no parse tree, using the given production_rules, that can cover
190                   the given word sequence.
191             - else
192                 - #parse_tree == the highest scoring parse tree that covers all the
193                   elements of words.
194                 - #parse_tree[0] == the root node of the parse tree.
195                 - #parse_tree[0].score == the score of the parse tree.  This is the sum of
196                   the scores of all production rules used to construct the tree.
197                 - #parse_tree[0].begin == 0
198                 - #parse_tree[0].end == words.size()
199             - This function uses production_rules() to find out what the allowed production
200               rules are.  That is, production_rules() defines all properties of the grammar
201               used by find_max_parse_cky().
202     !*/
203 
204 // -----------------------------------------------------------------------------------------
205 // -----------------------------------------------------------------------------------------
206 
207     class parse_tree_to_string_error : public error
208     {
209         /*!
210             WHAT THIS OBJECT REPRESENTS
211                 This is the exception thrown by parse_tree_to_string() and
212                 parse_tree_to_string_tagged() if the inputs are discovered to be invalid.
213         !*/
214     };
215 
216 // -----------------------------------------------------------------------------------------
217 
218     template <
219         typename T,
220         typename U
221         >
222     std::string parse_tree_to_string (
223         const std::vector<parse_tree_element<T> >& tree,
224         const std::vector<U>& words,
225         const unsigned long root_idx = 0
226     );
227     /*!
228         requires
229             - It must be possible to print U objects to an ostream using operator<<
230               (typically, U would be something like std::string)
231         ensures
232             - Interprets tree as a parse tree defined over the given sequence of words.
233             - returns a bracketed string that represents the parse tree over the words.
234               For example, suppose the following parse tree is input:
235 
236                         /\
237                        /  \
238                       /\   \
239                      /  \   \
240                    the dog  ran
241 
242               Then the output would be the string "[[the dog] ran]"
243             - Only the sub-tree rooted at tree[root_idx] will be output.  If root_idx >=
244               tree.size() then the empty string is returned.
245         throws
246             - parse_tree_to_string_error
247                 This exception is thrown if an invalid tree is detected.  This might happen
248                 if the tree refers to elements of words that don't exist because words is
249                 shorted than it is supposed to be.
250     !*/
251 
252 // -----------------------------------------------------------------------------------------
253 
254     template <
255         typename T,
256         typename U
257         >
258     std::string parse_tree_to_string_tagged (
259         const std::vector<parse_tree_element<T> >& tree,
260         const std::vector<U>& words,
261         const unsigned long root_idx = 0
262     );
263     /*!
264         requires
265             - It must be possible to print T objects to an ostream using operator<<
266             - It must be possible to print U objects to an ostream using operator<<
267               (typically, U would be something like std::string)
268         ensures
269             - This function does the same thing as parse_tree_to_string() except that it
270               also includes the parse_tree_element::tag object in the output.  Therefore,
271               the tag of each bracket will be included as the first token inside the
272               bracket.  For example, suppose the following parse tree is input (where tags
273               are shown at the vertices):
274 
275                         S
276                         /\
277                       NP  \
278                       /\   \
279                      /  \   \
280                    the dog  ran
281 
282               Then the output would be the string "[S [NP the dog] ran]"
283             - Only the sub-tree rooted at tree[root_idx] will be output.  If root_idx >=
284               tree.size() then the empty string is returned.
285         throws
286             - parse_tree_to_string_error
287                 This exception is thrown if an invalid tree is detected.  This might happen
288                 if the tree refers to elements of words that don't exist because words is
289                 shorted than it is supposed to be.
290     !*/
291 
292 // -----------------------------------------------------------------------------------------
293 
294     template <
295         typename T,
296         typename U
297         >
298     std::string parse_trees_to_string (
299         const std::vector<parse_tree_element<T> >& tree,
300         const std::vector<U>& words,
301         const T& tag_to_skip
302     );
303     /*!
304         requires
305             - It must be possible to print U objects to an ostream using operator<<
306               (typically, U would be something like std::string)
307         ensures
308             - This function behaves just like parse_tree_to_string() except that it will
309               not print the brackets (i.e. []) for the top most parts of the tree which
310               have tags equal to tag_to_skip.  It will however print all the words.
311               Therefore, this function only includes brackets on the subtrees which begin
312               with a tag other than tag_to_skip.
313         throws
314             - parse_tree_to_string_error
315                 This exception is thrown if an invalid tree is detected.  This might happen
316                 if the tree refers to elements of words that don't exist because words is
317                 shorted than it is supposed to be.
318     !*/
319 
320 // -----------------------------------------------------------------------------------------
321 
322     template <
323         typename T,
324         typename U
325         >
326     std::string parse_trees_to_string_tagged (
327         const std::vector<parse_tree_element<T> >& tree,
328         const std::vector<U>& words,
329         const T& tag_to_skip
330     );
331     /*!
332         requires
333             - It must be possible to print T objects to an ostream using operator<<
334             - It must be possible to print U objects to an ostream using operator<<
335               (typically, U would be something like std::string)
336         ensures
337             - This function behaves just like parse_tree_to_string_tagged() except that it
338               will not print the brackets (i.e. []) for the top most parts of the tree
339               which have tags equal to tag_to_skip.  It will however print all the words.
340               Therefore, this function only includes brackets on the subtrees which begin
341               with a tag other than tag_to_skip.
342         throws
343             - parse_tree_to_string_error
344                 This exception is thrown if an invalid tree is detected.  This might happen
345                 if the tree refers to elements of words that don't exist because words is
346                 shorted than it is supposed to be.
347     !*/
348 
349 // -----------------------------------------------------------------------------------------
350 
351     template <
352         typename T
353         >
354     void find_trees_not_rooted_with_tag (
355         const std::vector<parse_tree_element<T> >& tree,
356         const T& tag,
357         std::vector<unsigned long>& tree_roots
358     );
359     /*!
360         requires
361             - objects of type T must be comparable using operator==
362         ensures
363             - Finds all the largest non-overlapping trees in tree that are not rooted with
364               the given tag.
365             - find_trees_not_rooted_with_tag() is useful when you want to cut a parse tree
366               into a bunch of sub-trees and you know that the top level of the tree is all
367               composed of the same kind of tag.  So if you want to just "slice off" the top
368               of the tree where this tag lives then this function is useful for doing that.
369             - #tree_roots.size() == the number of sub-trees found.
370             - for all valid i:
371                 - tree[#tree_roots[i]].tag != tag
372             - To make the operation of this function clearer, here are a few examples of
373               what it will do:
374                 - if (tree[0].tag != tag) then
375                     - #tree_roots.size() == 0
376                     - #tree_roots[0] == 0
377                 - else if (tree[0].tag == tag but its immediate children's tags are not equal to tag) then
378                     - #tree_roots.size() == 2
379                     - #tree_roots[0] == tree[0].left
380                     - #tree_roots[1] == tree[0].right
381     !*/
382 
383 // -----------------------------------------------------------------------------------------
384 
385 }
386 
387 #endif // DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
388 
389