dlib/optimization/find_max_parse_cky_abstract.h

// Copyright (C) 2012  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
#ifdef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_

#include <vector>
#include <string>
#include "../algs.h"
#include "../serialize.h"

namespace dlib
{

// -----------------------------------------------------------------------------------------

    template <
        typename T
        >
    struct constituent
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This object represents the linguistic idea of a constituent, that is, a
                group of words that functions as a single unit.  In particular, it
                represents a combination of two constituents into a new constituent.

                Additionally, a constituent object represents a range of words relative to
                some std::vector of words.  The range is from [begin, end) (i.e. including
                begin but not including end, so using the normal C++ iterator notation).
                Moreover, a constituent is always composed of two parts, each having a tag.
                Therefore, the left part is composed of the words in the range [begin,k)
                and has tag left_tag while the right part of the constituent contains the
                words in the range [k,end) and has the tag right_tag.

                The tags are user defined objects of type T.  In general, they are used to
                represent syntactic categories such as noun phrase, verb phrase, etc.
        !*/

        unsigned long begin, end, k;
        T left_tag;
        T right_tag;
    };

    template <
        typename T
        >
    void serialize(
        const constituent<T>& item,
        std::ostream& out
    );
    /*!
        provides serialization support
    !*/

    template <
        typename T
        >
    void deserialize(
        constituent<T>& item,
        std::istream& in
    );
    /*!
        provides deserialization support
    !*/

// -----------------------------------------------------------------------------------------

    /*!A END_OF_TREE is used to indicate that parse_tree_element::left or
         parse_tree_element::right doesn't point to another subtree.
    !*/
    const unsigned long END_OF_TREE = 0xFFFFFFFF;

// -----------------------------------------------------------------------------------------

    template <
        typename T
        >
    struct parse_tree_element
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This object is used to represent a node in a binary parse tree.  An entire
                parse tree is represented by a std::vector of parse_tree_element objects.
                We follow the convention that the first element of this vector is always
                the root of the entire tree.

                The fields of this object have the following interpretations:
                    - c == the constituent spanned by this node in the parse tree.
                      Therefore, the node spans the words in the range [c.begin, c.end).
                    - tag == the syntactic category of this node in the parse tree.
                    - score == the score or log likelihood for this parse tree.  In
                      general, this is the sum of scores of all the production rules used
                      to build the tree rooted at the current node.
                    - let PT denote the vector of parse_tree_elements that defines an
                      entire parse tree.  Then we have:
                        - if (left != END_OF_TREE) then
                            - PT[left] == the left sub-tree of the current node.
                            - PT[left] spans the words [c.begin, c.k)
                            - PT[left].tag == c.left_tag
                        - else
                            - there is no left sub-tree

                        - if (right != END_OF_TREE) then
                            - PT[right] == the right sub-tree of the current node.
                            - PT[right] spans the words [c.k, c.end)
                            - PT[right].tag == c.right_tag
                        - else
                            - there is no right sub-tree
        !*/

        constituent<T> c;
        T tag;
        double score;

        unsigned long left;
        unsigned long right;
    };

    template <
        typename T
        >
    void serialize (
        const parse_tree_element<T>& item,
        std::ostream& out
    );
    /*!
        provides serialization support
    !*/

    template <
        typename T
        >
    void deserialize (
        parse_tree_element<T>& item,
        std::istream& in
    );
    /*!
        provides deserialization support
    !*/

// -----------------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------------

    void example_production_rule_function (
        const std::vector<T>& words,
        const constituent<T>& c,
        std::vector<std::pair<T,double> >& possible_tags
    )
    /*!
        requires
            - 0 <= c.begin < c.k < c.end <= words.size()
            - possible_tags.size() == 0
        ensures
            - Finds all the syntactic categories that can be used to label c and puts those
              categories, along with their scores, into possible_tags.  Or in other words,
              this function determines which production rules can be used to turn the left
              and right sub-constituents in c into a single constituent.  The contents of c
              have the following interpretations:
                - The left sub-constituent has syntactic category c.left_tag
                - for all i such that c.begin <= i < c.k:
                    - words[i] is part of the left sub-constituent.
                - The right sub-constituent has syntactic category c.right_tag
                - for all i such that c.k <= i < c.end:
                    - words[i] is part of the right sub-constituent.

            - Note that example_production_rule_function() is not a real function.  It is
              here just to show you how to define production rule producing functions for
              use with the find_max_parse_cky() routine defined below.
    !*/

    template <
        typename T,
        typename production_rule_function
        >
    void find_max_parse_cky (
        const std::vector<T>& words,
        const production_rule_function& production_rules,
        std::vector<parse_tree_element<T> >& parse_tree
    );
    /*!
        requires
            - production_rule_function == a function or function object with the same
              interface as example_production_rule_function defined above.
            - It must be possible to store T objects in a std::map.
        ensures
            - Uses the CKY algorithm to find the most probable/highest scoring binary parse
              tree of the given vector of words.
            - if (#parse_tree.size() == 0) then
                - There is no parse tree, using the given production_rules, that can cover
                  the given word sequence.
            - else
                - #parse_tree == the highest scoring parse tree that covers all the
                  elements of words.
                - #parse_tree[0] == the root node of the parse tree.
                - #parse_tree[0].score == the score of the parse tree.  This is the sum of
                  the scores of all production rules used to construct the tree.
                - #parse_tree[0].begin == 0
                - #parse_tree[0].end == words.size()
            - This function uses production_rules() to find out what the allowed production
              rules are.  That is, production_rules() defines all properties of the grammar
              used by find_max_parse_cky().
    !*/

// -----------------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------------

    class parse_tree_to_string_error : public error
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This is the exception thrown by parse_tree_to_string() and
                parse_tree_to_string_tagged() if the inputs are discovered to be invalid.
        !*/
    };

// -----------------------------------------------------------------------------------------

    template <
        typename T,
        typename U
        >
    std::string parse_tree_to_string (
        const std::vector<parse_tree_element<T> >& tree,
        const std::vector<U>& words,
        const unsigned long root_idx = 0
    );
    /*!
        requires
            - It must be possible to print U objects to an ostream using operator<<
              (typically, U would be something like std::string)
        ensures
            - Interprets tree as a parse tree defined over the given sequence of words.
            - returns a bracketed string that represents the parse tree over the words.
              For example, suppose the following parse tree is input:

                        /\
                       /  \
                      /\   \
                     /  \   \
                   the dog  ran

              Then the output would be the string "[[the dog] ran]"
            - Only the sub-tree rooted at tree[root_idx] will be output.  If root_idx >=
              tree.size() then the empty string is returned.
        throws
            - parse_tree_to_string_error
                This exception is thrown if an invalid tree is detected.  This might happen
                if the tree refers to elements of words that don't exist because words is
                shorted than it is supposed to be.
    !*/

// -----------------------------------------------------------------------------------------

    template <
        typename T,
        typename U
        >
    std::string parse_tree_to_string_tagged (
        const std::vector<parse_tree_element<T> >& tree,
        const std::vector<U>& words,
        const unsigned long root_idx = 0
    );
    /*!
        requires
            - It must be possible to print T objects to an ostream using operator<<
            - It must be possible to print U objects to an ostream using operator<<
              (typically, U would be something like std::string)
        ensures
            - This function does the same thing as parse_tree_to_string() except that it
              also includes the parse_tree_element::tag object in the output.  Therefore,
              the tag of each bracket will be included as the first token inside the
              bracket.  For example, suppose the following parse tree is input (where tags
              are shown at the vertices):

                        S
                        /\
                      NP  \
                      /\   \
                     /  \   \
                   the dog  ran

              Then the output would be the string "[S [NP the dog] ran]"
            - Only the sub-tree rooted at tree[root_idx] will be output.  If root_idx >=
              tree.size() then the empty string is returned.
        throws
            - parse_tree_to_string_error
                This exception is thrown if an invalid tree is detected.  This might happen
                if the tree refers to elements of words that don't exist because words is
                shorted than it is supposed to be.
    !*/

// -----------------------------------------------------------------------------------------

    template <
        typename T,
        typename U
        >
    std::string parse_trees_to_string (
        const std::vector<parse_tree_element<T> >& tree,
        const std::vector<U>& words,
        const T& tag_to_skip
    );
    /*!
        requires
            - It must be possible to print U objects to an ostream using operator<<
              (typically, U would be something like std::string)
        ensures
            - This function behaves just like parse_tree_to_string() except that it will
              not print the brackets (i.e. []) for the top most parts of the tree which
              have tags equal to tag_to_skip.  It will however print all the words.
              Therefore, this function only includes brackets on the subtrees which begin
              with a tag other than tag_to_skip.
        throws
            - parse_tree_to_string_error
                This exception is thrown if an invalid tree is detected.  This might happen
                if the tree refers to elements of words that don't exist because words is
                shorted than it is supposed to be.
    !*/

// -----------------------------------------------------------------------------------------

    template <
        typename T,
        typename U
        >
    std::string parse_trees_to_string_tagged (
        const std::vector<parse_tree_element<T> >& tree,
        const std::vector<U>& words,
        const T& tag_to_skip
    );
    /*!
        requires
            - It must be possible to print T objects to an ostream using operator<<
            - It must be possible to print U objects to an ostream using operator<<
              (typically, U would be something like std::string)
        ensures
            - This function behaves just like parse_tree_to_string_tagged() except that it
              will not print the brackets (i.e. []) for the top most parts of the tree
              which have tags equal to tag_to_skip.  It will however print all the words.
              Therefore, this function only includes brackets on the subtrees which begin
              with a tag other than tag_to_skip.
        throws
            - parse_tree_to_string_error
                This exception is thrown if an invalid tree is detected.  This might happen
                if the tree refers to elements of words that don't exist because words is
                shorted than it is supposed to be.
    !*/

// -----------------------------------------------------------------------------------------

    template <
        typename T
        >
    void find_trees_not_rooted_with_tag (
        const std::vector<parse_tree_element<T> >& tree,
        const T& tag,
        std::vector<unsigned long>& tree_roots
    );
    /*!
        requires
            - objects of type T must be comparable using operator==
        ensures
            - Finds all the largest non-overlapping trees in tree that are not rooted with
              the given tag.
            - find_trees_not_rooted_with_tag() is useful when you want to cut a parse tree
              into a bunch of sub-trees and you know that the top level of the tree is all
              composed of the same kind of tag.  So if you want to just "slice off" the top
              of the tree where this tag lives then this function is useful for doing that.
            - #tree_roots.size() == the number of sub-trees found.
            - for all valid i:
                - tree[#tree_roots[i]].tag != tag
            - To make the operation of this function clearer, here are a few examples of
              what it will do:
                - if (tree[0].tag != tag) then
                    - #tree_roots.size() == 0
                    - #tree_roots[0] == 0
                - else if (tree[0].tag == tag but its immediate children's tags are not equal to tag) then
                    - #tree_roots.size() == 2
                    - #tree_roots[0] == tree[0].left
                    - #tree_roots[1] == tree[0].right
    !*/

// -----------------------------------------------------------------------------------------

}

#endif // DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_