1 /*
2  * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful, but
10  * WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, see <http://www.gnu.org/licenses/>.
16  */
17 #ifndef __TAGGERWORD_H
18 #define __TAGGERWORD_H
19 
20 #include <iostream>
21 #include <map>
22 #include <set>
23 #include <string>
24 #include <vector>
25 
26 #include <lttoolbox/ltstr.h>
27 #include <apertium/ttag.h>
28 #include <apertium/apertium_re.h>
29 
30 using namespace std;
31 
32 /** Class TaggerWord.
33  *  It stores the superficial form and all possible tags that it can receive.
34  *  It has the fine tags delivered by the morphological analyzer and the coarse
35  *  ones used by the PoS tagger.
36  */
37 class TaggerWord{
38 private:
39   wstring superficial_form;
40 
41   set<TTag> tags;  //Set of all possible tags
42   map<TTag, wstring> lexical_forms;  //For a given coarse tag it stores the fine tag
43                                     //delevered by the morphological analyzer
44   wstring ignored_string;
45 
46   bool plus_cut; //Flag to distinguish the way in which the word was ended.
47                   //If it was done by '$' its value should be false
48                   //If it was done by '+' its value should be true
49   bool previous_plus_cut; //Flag to distinguish the way in which the
50 			  //previous word was ended. It has the same
51 			  //plus_cut meaning
52   bool show_sf; // Show the superficial form in the output
53   static map<wstring, ApertiumRE, Ltstr> patterns;
54 
55   bool match(wstring const &s, wstring const &pattern);
56 public:
57   static bool generate_marks;
58   static vector<wstring> array_tags;
59 
60   static bool show_ignored_string;
61 
62    /**
63     * Constructor
64     */
65    TaggerWord(bool prev_plus_cut=false);
66 
67    /**
68     * Copy constructor
69     */
70    TaggerWord(const TaggerWord &w);
71 
72    /**
73     * Destructor
74     */
75    virtual ~TaggerWord();
76 
77    /** Set the superficial form of the word.
78     *  @param s the superficial form
79     */
80    void set_superficial_form(const wstring &s);
81 
82    /** Get the superficial form of the word
83     *
84     */
85    wstring& get_superficial_form();
86 
87    /** Add a new tag to the set of all possible tags of the word.
88     *  @param t the coarse tag
89     *  @param lf the lexical form (fine tag)
90     */
91    virtual void add_tag(TTag &t, const wstring &lf, vector<wstring> const &prefer_rules);
92 
93    /** Get the set of tags of this word.
94     *  @return  set of tags.
95     */
96    virtual set<TTag>& get_tags();
97 
98    /** Get a wstring with the set of tags
99     */
100    virtual wstring get_string_tags();
101 
102   /** Get the lexical form (fine tag) for a given tag (coarse one)
103    *  @param  t the tag
104    *  @return the lexical form of tag t
105    */
106   virtual wstring get_lexical_form(TTag &t, int const TAG_kEOF);
107 
108   wstring get_all_chosen_tag_first(TTag &t, int const TAG_kEOF);
109 
110   /** Get the lexical form (fine tag) for a given tag (coarse one)
111    *  @param  t the tag
112    *  @return the lexical form of tag t without other text that
113    *          is ignored.
114    */
115   wstring get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF);
116 
117   /** Add text to the ignored string
118    *
119    */
120   void add_ignored_string(wstring const &s);
121 
122   /** Set the flag plus_cut to a certain value. If this flag is set to true means
123    *  that there were a '+' between this word and the next one
124    */
125    void set_plus_cut(const bool &c);
126 
127   /**
128    * Get and set the "show superficial form" flag
129    */
130   void set_show_sf(bool sf);
131   bool get_show_sf();
132 
133   /** Get the value of the plus_cut flag */
134   bool get_plus_cut();
135 
136   /** Output operator
137    */
138   friend wostream& operator<< (wostream& os, TaggerWord &w);
139 
140   static void setArrayTags(vector<wstring> const &at);
141 
142   void print();
143 
144   void outputOriginal(FILE *output);
145 
146   bool isAmbiguous() const;  // CAUTION: unknown words are not considered to
147                              // be ambiguous by this method
148 
149   void discardOnAmbiguity(wstring const &tags);
150 };
151 
152 #endif
153