1 /*
2  * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful, but
10  * WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <apertium/tagger_utils.h>
19 #include <apertium/file_morpho_stream.h>
20 
21 #include <stdio.h>
22 #include <sstream>
23 #include <algorithm>
24 #include <climits>
25 #include <apertium/string_utils.h>
26 #ifdef _MSC_VER
27 #define wcstok wcstok_s
28 #endif
29 #ifdef __MINGW32__
30 
_wcstok(wchar_t * wcs,const wchar_t * delim,wchar_t ** ptr)31 wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr) {
32   (void)ptr;
33   return wcstok(wcs, delim);
34 }
35 
36 #define wcstok _wcstok
37 #endif
38 
39 using namespace Apertium;
40 
41 
fatal_error(wstring const & s)42 void tagger_utils::fatal_error (wstring const &s) {
43   wcerr<<L"Error: "<<s<<L"\n";
44   exit(1);
45 }
46 
file_name_error(string const & s)47 void tagger_utils::file_name_error (string const &s) {
48   wcerr << "Error: " << s << endl;
49   exit(1);
50 }
51 
itoa(int i)52 char * tagger_utils::itoa(int i) {
53   static char buf[512];
54   sprintf(buf,"%d",i);
55   return buf;
56 }
57 
clear_array_double(double a[],int l)58 void tagger_utils::clear_array_double(double a[], int l) {
59   for(int i=0; i<l; i++)
60     a[i]=0.0;
61 }
62 
clear_array_vector(vector<TTag> v[],int l)63 void tagger_utils::clear_array_vector(vector<TTag> v[], int l) {
64   for(int i=0; i<l; i++)
65     v[i].clear();
66 }
67 
ntokens_multiword(wstring const & s)68 int tagger_utils::ntokens_multiword(wstring const &s)
69 {
70    wchar_t *news = new wchar_t[s.size()+1];
71    wcscpy(news, s.c_str());
72    news[s.size()] = 0;
73    wcerr << news << endl;
74 
75    wchar_t const *delim = L"_";
76    wchar_t *ptr;
77    int n=0;
78 
79    if (wcstok(news, delim, &ptr))
80      n++;
81    while (wcstok(NULL, delim, &ptr))
82      n++;
83 
84    delete[] news;
85 
86    return n;
87 }
88 
nguiones_fs(wstring const & s)89 int tagger_utils::nguiones_fs(wstring const & s) {
90    wchar_t *news = new wchar_t[s.size()+1];
91    wcscpy(news, s.c_str());
92    news[s.size()] = 0;
93    wcerr << news << endl;
94    wchar_t const *delim = L"-";
95    wchar_t *ptr;
96    int n=0;
97 
98    if (wcstok(news, delim, &ptr))
99      n++;
100    while (wcstok(NULL, delim, &ptr))
101      n++;
102 
103    delete[] news;
104 
105    return n;
106 }
107 
trim(wstring s)108 wstring tagger_utils::trim(wstring s)
109 {
110   if (s.length()==0)
111     return L"";
112 
113   for (unsigned int i=0; i<(s.length()-1); i++) {
114     if ((s.at(i)==L' ')&&(s.at(i+1)==L' ')) {
115       s.erase(i,1);
116       i--;
117     }
118   }
119 
120   if ((s.length()>0)&&(s.at(s.length()-1)==L' '))
121     s.erase(s.length()-1,1);
122   if ((s.length()>0)&&(s.at(0)==L' '))
123     s.erase(0,1);
124 
125   return s;
126 }
127 
scan_for_ambg_classes(FILE * fdic,TaggerData & td)128 void tagger_utils::scan_for_ambg_classes(FILE *fdic, TaggerData &td) {
129   Collection &output = td.getOutput();
130   FileMorphoStream morpho_stream(fdic, true, &td);
131   tagger_utils::scan_for_ambg_classes(output, morpho_stream);
132 }
133 
scan_for_ambg_classes(Collection & output,MorphoStream & morpho_stream)134 void tagger_utils::scan_for_ambg_classes(Collection &output, MorphoStream &morpho_stream) {
135   int nw = 0;
136   set <TTag> tags;
137   TaggerWord *word = NULL;
138 
139   // In the input dictionary there must be all punctuation marks, including the end-of-sentence mark
140 
141   word = morpho_stream.get_next_word();
142 
143   while (word) {
144     if (++nw % 10000 == 0)
145       wcerr << L'.' << flush;
146 
147     tags = word->get_tags();
148 
149     if (tags.size() > 0)
150       output[tags];
151 
152     delete word;
153     word = morpho_stream.get_next_word();
154   }
155   wcerr << L"\n";
156 }
157 
158 void
add_neccesary_ambg_classes(TaggerData & td)159 tagger_utils::add_neccesary_ambg_classes(TaggerData &td) {
160   int i;
161   Collection &output = td.getOutput();
162 
163   // OPEN AMBIGUITY CLASS
164   // It contains all tags that are not closed.
165   // Unknown words are assigned the open ambiguity class
166   output[td.getOpenClass()];
167 
168   // Create ambiguity class holding one single tag for each tag.
169   // If not created yet
170   int N = (td.getTagIndex()).size();
171   for(i = 0; i != N; i++) {
172     set<TTag> amb_class;
173     amb_class.insert(i);
174     output[amb_class];
175   }
176 }
177 
178 set<TTag> &
find_similar_ambiguity_class(TaggerData & td,set<TTag> & c)179 tagger_utils::find_similar_ambiguity_class(TaggerData &td, set<TTag> &c) {
180   set<TTag> &ret = td.getOpenClass();
181   Collection &output = td.getOutput();
182   int ret_idx = output[ret];
183 
184   for (int k=0; k<output.size(); k++) {
185     const set<TTag> &ambg_class = output[k];
186     if (ambg_class.size() > ret.size() ||
187         (ambg_class.size() == ret.size())) {
188       continue;
189     }
190     if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) {
191       ret_idx = k;
192       ret = ambg_class;
193     }
194   }
195   return ret;
196 }
197 
198 void
require_ambiguity_class(TaggerData & td,set<TTag> & tags,TaggerWord & word,int nw)199 tagger_utils::require_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, int nw) {
200   if (td.getOutput().has_not(tags)) {
201     wstring errors;
202     errors = L"A new ambiguity class was found. I cannot continue.\n";
203     errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n";
204     errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n";
205     if (nw >= 0) {
206       std::wostringstream ws;
207       ws << (nw + 1);
208       errors+= L"Line number: " + ws.str() + L"\n";
209     }
210     errors+= L"Take a look at the dictionary, then retrain.";
211     fatal_error(errors);
212   }
213 }
214 
_warn_absent_ambiguity_class(TaggerWord & word)215 static void _warn_absent_ambiguity_class(TaggerWord &word) {
216   wstring errors;
217   errors = L"A new ambiguity class was found. \n";
218   errors += L"Retraining the tagger is necessary so as to take it into account.\n";
219   errors += L"Word '" + word.get_superficial_form() + L"'.\n";
220   errors += L"New ambiguity class: " + word.get_string_tags() + L"\n";
221   wcerr << L"Error: " << errors;
222 }
223 
224 set<TTag> &
require_similar_ambiguity_class(TaggerData & td,set<TTag> & tags,TaggerWord & word,bool warn)225 tagger_utils::require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool warn) {
226   if (td.getOutput().has_not(tags)) {
227     if (warn) {
228       _warn_absent_ambiguity_class(word);
229     }
230     return find_similar_ambiguity_class(td, tags);
231   }
232   return tags;
233 }
234 
235 set<TTag> &
require_similar_ambiguity_class(TaggerData & td,set<TTag> & tags)236 tagger_utils::require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags) {
237   if (td.getOutput().has_not(tags)) {
238     return find_similar_ambiguity_class(td, tags);
239   }
240   return tags;
241 }
242 
243 void
warn_absent_ambiguity_class(TaggerData & td,set<TTag> & tags,TaggerWord & word,bool warn)244 tagger_utils::warn_absent_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool warn) {
245   if (warn && td.getOutput().has_not(tags)) {
246     _warn_absent_ambiguity_class(word);
247   }
248 }
249 
// Serialise a map with int keys as text: the number of entries, followed by
// " key value" for each entry in key order. This is the format read back by
// the matching operator>>.
template <class T>
std::ostream& operator<< (std::ostream& os, const std::map <int, T> & f){
  typedef typename std::map <int, T>::const_iterator entry_iterator;

  os << f.size();
  for (entry_iterator entry = f.begin(), last = f.end(); entry != last; ++entry) {
    os << ' ' << entry->first;
    os << ' ' << entry->second;
  }
  return os;
}
258 
259 template <class T>
operator >>(istream & is,map<int,T> & f)260 istream& operator>> (istream& is, map <int, T> & f) {
261   int n, i, k;
262   f.clear();
263   is>>n;
264   for (k=0; k<n; k++) {
265     is>>i;     // warning: does not work if both
266     is>>f[i];  // lines merged in a single one
267   }
268   if (is.bad()) tagger_utils::fatal_error(L"reading map");
269   return is;
270 }
271 
// Write a set as "{e1,e2,...}" with its elements in iteration order and no
// spaces; an empty set is written as "{}".
template <class T>
std::ostream& operator<< (std::ostream& os, const std::set<T>& s) {
  os << '{';
  bool first = true;
  for (typename std::set<T>::const_iterator element = s.begin();
       element != s.end(); ++element) {
    if (!first) {
      os << ',';
    }
    os << *element;
    first = false;
  }
  os << '}';
  return os;
}
283 
284