1 // Copyright (C) 2012 Davis E. King (davis@dlib.net) 2 // License: Boost Software License See LICENSE.txt for the full license. 3 #undef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_ 4 #ifdef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_ 5 6 #include <vector> 7 #include <string> 8 #include "../algs.h" 9 #include "../serialize.h" 10 11 namespace dlib 12 { 13 14 // ----------------------------------------------------------------------------------------- 15 16 template < 17 typename T 18 > 19 struct constituent 20 { 21 /*! 22 WHAT THIS OBJECT REPRESENTS 23 This object represents the linguistic idea of a constituent, that is, a 24 group of words that functions as a single unit. In particular, it 25 represents a combination of two constituents into a new constituent. 26 27 Additionally, a constituent object represents a range of words relative to 28 some std::vector of words. The range is from [begin, end) (i.e. including 29 begin but not including end, so using the normal C++ iterator notation). 30 Moreover, a constituent is always composed of two parts, each having a tag. 31 Therefore, the left part is composed of the words in the range [begin,k) 32 and has tag left_tag while the right part of the constituent contains the 33 words in the range [k,end) and has the tag right_tag. 34 35 The tags are user defined objects of type T. In general, they are used to 36 represent syntactic categories such as noun phrase, verb phrase, etc. 37 !*/ 38 39 unsigned long begin, end, k; 40 T left_tag; 41 T right_tag; 42 }; 43 44 template < 45 typename T 46 > 47 void serialize( 48 const constituent<T>& item, 49 std::ostream& out 50 ); 51 /*! 52 provides serialization support 53 !*/ 54 55 template < 56 typename T 57 > 58 void deserialize( 59 constituent<T>& item, 60 std::istream& in 61 ); 62 /*! 63 provides deserialization support 64 !*/ 65 66 // ----------------------------------------------------------------------------------------- 67 68 /*!A END_OF_TREE is used to indicate that parse_tree_element::left or 69 parse_tree_element::right doesn't point to another subtree. 70 !*/ 71 const unsigned long END_OF_TREE = 0xFFFFFFFF; 72 73 // ----------------------------------------------------------------------------------------- 74 75 template < 76 typename T 77 > 78 struct parse_tree_element 79 { 80 /*! 81 WHAT THIS OBJECT REPRESENTS 82 This object is used to represent a node in a binary parse tree. An entire 83 parse tree is represented by a std::vector of parse_tree_element objects. 84 We follow the convention that the first element of this vector is always 85 the root of the entire tree. 86 87 The fields of this object have the following interpretations: 88 - c == the constituent spanned by this node in the parse tree. 89 Therefore, the node spans the words in the range [c.begin, c.end). 90 - tag == the syntactic category of this node in the parse tree. 91 - score == the score or log likelihood for this parse tree. In 92 general, this is the sum of scores of all the production rules used 93 to build the tree rooted at the current node. 94 - let PT denote the vector of parse_tree_elements that defines an 95 entire parse tree. Then we have: 96 - if (left != END_OF_TREE) then 97 - PT[left] == the left sub-tree of the current node. 98 - PT[left] spans the words [c.begin, c.k) 99 - PT[left].tag == c.left_tag 100 - else 101 - there is no left sub-tree 102 103 - if (right != END_OF_TREE) then 104 - PT[right] == the right sub-tree of the current node. 105 - PT[right] spans the words [c.k, c.end) 106 - PT[right].tag == c.right_tag 107 - else 108 - there is no right sub-tree 109 !*/ 110 111 constituent<T> c; 112 T tag; 113 double score; 114 115 unsigned long left; 116 unsigned long right; 117 }; 118 119 template < 120 typename T 121 > 122 void serialize ( 123 const parse_tree_element<T>& item, 124 std::ostream& out 125 ); 126 /*! 127 provides serialization support 128 !*/ 129 130 template < 131 typename T 132 > 133 void deserialize ( 134 parse_tree_element<T>& item, 135 std::istream& in 136 ); 137 /*! 138 provides deserialization support 139 !*/ 140 141 // ----------------------------------------------------------------------------------------- 142 // ----------------------------------------------------------------------------------------- 143 144 void example_production_rule_function ( 145 const std::vector<T>& words, 146 const constituent<T>& c, 147 std::vector<std::pair<T,double> >& possible_tags 148 ) 149 /*! 150 requires 151 - 0 <= c.begin < c.k < c.end <= words.size() 152 - possible_tags.size() == 0 153 ensures 154 - Finds all the syntactic categories that can be used to label c and puts those 155 categories, along with their scores, into possible_tags. Or in other words, 156 this function determines which production rules can be used to turn the left 157 and right sub-constituents in c into a single constituent. The contents of c 158 have the following interpretations: 159 - The left sub-constituent has syntactic category c.left_tag 160 - for all i such that c.begin <= i < c.k: 161 - words[i] is part of the left sub-constituent. 162 - The right sub-constituent has syntactic category c.right_tag 163 - for all i such that c.k <= i < c.end: 164 - words[i] is part of the right sub-constituent. 165 166 - Note that example_production_rule_function() is not a real function. It is 167 here just to show you how to define production rule producing functions for 168 use with the find_max_parse_cky() routine defined below. 169 !*/ 170 171 template < 172 typename T, 173 typename production_rule_function 174 > 175 void find_max_parse_cky ( 176 const std::vector<T>& words, 177 const production_rule_function& production_rules, 178 std::vector<parse_tree_element<T> >& parse_tree 179 ); 180 /*! 181 requires 182 - production_rule_function == a function or function object with the same 183 interface as example_production_rule_function defined above. 184 - It must be possible to store T objects in a std::map. 185 ensures 186 - Uses the CKY algorithm to find the most probable/highest scoring binary parse 187 tree of the given vector of words. 188 - if (#parse_tree.size() == 0) then 189 - There is no parse tree, using the given production_rules, that can cover 190 the given word sequence. 191 - else 192 - #parse_tree == the highest scoring parse tree that covers all the 193 elements of words. 194 - #parse_tree[0] == the root node of the parse tree. 195 - #parse_tree[0].score == the score of the parse tree. This is the sum of 196 the scores of all production rules used to construct the tree. 197 - #parse_tree[0].begin == 0 198 - #parse_tree[0].end == words.size() 199 - This function uses production_rules() to find out what the allowed production 200 rules are. That is, production_rules() defines all properties of the grammar 201 used by find_max_parse_cky(). 202 !*/ 203 204 // ----------------------------------------------------------------------------------------- 205 // ----------------------------------------------------------------------------------------- 206 207 class parse_tree_to_string_error : public error 208 { 209 /*! 210 WHAT THIS OBJECT REPRESENTS 211 This is the exception thrown by parse_tree_to_string() and 212 parse_tree_to_string_tagged() if the inputs are discovered to be invalid. 213 !*/ 214 }; 215 216 // ----------------------------------------------------------------------------------------- 217 218 template < 219 typename T, 220 typename U 221 > 222 std::string parse_tree_to_string ( 223 const std::vector<parse_tree_element<T> >& tree, 224 const std::vector<U>& words, 225 const unsigned long root_idx = 0 226 ); 227 /*! 228 requires 229 - It must be possible to print U objects to an ostream using operator<< 230 (typically, U would be something like std::string) 231 ensures 232 - Interprets tree as a parse tree defined over the given sequence of words. 233 - returns a bracketed string that represents the parse tree over the words. 234 For example, suppose the following parse tree is input: 235 236 /\ 237 / \ 238 /\ \ 239 / \ \ 240 the dog ran 241 242 Then the output would be the string "[[the dog] ran]" 243 - Only the sub-tree rooted at tree[root_idx] will be output. If root_idx >= 244 tree.size() then the empty string is returned. 245 throws 246 - parse_tree_to_string_error 247 This exception is thrown if an invalid tree is detected. This might happen 248 if the tree refers to elements of words that don't exist because words is 249 shorted than it is supposed to be. 250 !*/ 251 252 // ----------------------------------------------------------------------------------------- 253 254 template < 255 typename T, 256 typename U 257 > 258 std::string parse_tree_to_string_tagged ( 259 const std::vector<parse_tree_element<T> >& tree, 260 const std::vector<U>& words, 261 const unsigned long root_idx = 0 262 ); 263 /*! 264 requires 265 - It must be possible to print T objects to an ostream using operator<< 266 - It must be possible to print U objects to an ostream using operator<< 267 (typically, U would be something like std::string) 268 ensures 269 - This function does the same thing as parse_tree_to_string() except that it 270 also includes the parse_tree_element::tag object in the output. Therefore, 271 the tag of each bracket will be included as the first token inside the 272 bracket. For example, suppose the following parse tree is input (where tags 273 are shown at the vertices): 274 275 S 276 /\ 277 NP \ 278 /\ \ 279 / \ \ 280 the dog ran 281 282 Then the output would be the string "[S [NP the dog] ran]" 283 - Only the sub-tree rooted at tree[root_idx] will be output. If root_idx >= 284 tree.size() then the empty string is returned. 285 throws 286 - parse_tree_to_string_error 287 This exception is thrown if an invalid tree is detected. This might happen 288 if the tree refers to elements of words that don't exist because words is 289 shorted than it is supposed to be. 290 !*/ 291 292 // ----------------------------------------------------------------------------------------- 293 294 template < 295 typename T, 296 typename U 297 > 298 std::string parse_trees_to_string ( 299 const std::vector<parse_tree_element<T> >& tree, 300 const std::vector<U>& words, 301 const T& tag_to_skip 302 ); 303 /*! 304 requires 305 - It must be possible to print U objects to an ostream using operator<< 306 (typically, U would be something like std::string) 307 ensures 308 - This function behaves just like parse_tree_to_string() except that it will 309 not print the brackets (i.e. []) for the top most parts of the tree which 310 have tags equal to tag_to_skip. It will however print all the words. 311 Therefore, this function only includes brackets on the subtrees which begin 312 with a tag other than tag_to_skip. 313 throws 314 - parse_tree_to_string_error 315 This exception is thrown if an invalid tree is detected. This might happen 316 if the tree refers to elements of words that don't exist because words is 317 shorted than it is supposed to be. 318 !*/ 319 320 // ----------------------------------------------------------------------------------------- 321 322 template < 323 typename T, 324 typename U 325 > 326 std::string parse_trees_to_string_tagged ( 327 const std::vector<parse_tree_element<T> >& tree, 328 const std::vector<U>& words, 329 const T& tag_to_skip 330 ); 331 /*! 332 requires 333 - It must be possible to print T objects to an ostream using operator<< 334 - It must be possible to print U objects to an ostream using operator<< 335 (typically, U would be something like std::string) 336 ensures 337 - This function behaves just like parse_tree_to_string_tagged() except that it 338 will not print the brackets (i.e. []) for the top most parts of the tree 339 which have tags equal to tag_to_skip. It will however print all the words. 340 Therefore, this function only includes brackets on the subtrees which begin 341 with a tag other than tag_to_skip. 342 throws 343 - parse_tree_to_string_error 344 This exception is thrown if an invalid tree is detected. This might happen 345 if the tree refers to elements of words that don't exist because words is 346 shorted than it is supposed to be. 347 !*/ 348 349 // ----------------------------------------------------------------------------------------- 350 351 template < 352 typename T 353 > 354 void find_trees_not_rooted_with_tag ( 355 const std::vector<parse_tree_element<T> >& tree, 356 const T& tag, 357 std::vector<unsigned long>& tree_roots 358 ); 359 /*! 360 requires 361 - objects of type T must be comparable using operator== 362 ensures 363 - Finds all the largest non-overlapping trees in tree that are not rooted with 364 the given tag. 365 - find_trees_not_rooted_with_tag() is useful when you want to cut a parse tree 366 into a bunch of sub-trees and you know that the top level of the tree is all 367 composed of the same kind of tag. So if you want to just "slice off" the top 368 of the tree where this tag lives then this function is useful for doing that. 369 - #tree_roots.size() == the number of sub-trees found. 370 - for all valid i: 371 - tree[#tree_roots[i]].tag != tag 372 - To make the operation of this function clearer, here are a few examples of 373 what it will do: 374 - if (tree[0].tag != tag) then 375 - #tree_roots.size() == 0 376 - #tree_roots[0] == 0 377 - else if (tree[0].tag == tag but its immediate children's tags are not equal to tag) then 378 - #tree_roots.size() == 2 379 - #tree_roots[0] == tree[0].left 380 - #tree_roots[1] == tree[0].right 381 !*/ 382 383 // ----------------------------------------------------------------------------------------- 384 385 } 386 387 #endif // DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_ 388 389