1 /*
2 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <apertium/tagger_utils.h>
19 #include <apertium/file_morpho_stream.h>
20
21 #include <stdio.h>
22 #include <sstream>
23 #include <algorithm>
24 #include <climits>
25 #include <apertium/string_utils.h>
26 #ifdef _MSC_VER
27 #define wcstok wcstok_s
28 #endif
29 #ifdef __MINGW32__
30
_wcstok(wchar_t * wcs,const wchar_t * delim,wchar_t ** ptr)31 wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr) {
32 (void)ptr;
33 return wcstok(wcs, delim);
34 }
35
36 #define wcstok _wcstok
37 #endif
38
39 using namespace Apertium;
40
41
fatal_error(wstring const & s)42 void tagger_utils::fatal_error (wstring const &s) {
43 wcerr<<L"Error: "<<s<<L"\n";
44 exit(1);
45 }
46
file_name_error(string const & s)47 void tagger_utils::file_name_error (string const &s) {
48 wcerr << "Error: " << s << endl;
49 exit(1);
50 }
51
/**
 * Formats an integer as decimal text.
 *
 * The text is written into a function-local static buffer, so the returned
 * pointer refers to storage that is overwritten by the next call.
 *
 * @param i value to format
 * @return pointer to the NUL-terminated decimal representation of i
 */
char * tagger_utils::itoa(int i) {
  static char decimal_text[512];
  sprintf(decimal_text, "%d", i);
  return &decimal_text[0];
}
57
/**
 * Sets the first l elements of an array of doubles to 0.0.
 *
 * @param a array to reset
 * @param l number of leading elements to reset; nothing is written when
 *          l is zero or negative
 */
void tagger_utils::clear_array_double(double a[], int l) {
  if (l <= 0)
    return;
  std::fill(a, a + l, 0.0);
}
62
clear_array_vector(vector<TTag> v[],int l)63 void tagger_utils::clear_array_vector(vector<TTag> v[], int l) {
64 for(int i=0; i<l; i++)
65 v[i].clear();
66 }
67
/**
 * Counts the non-empty pieces of a string when it is split on L'_'.
 *
 * Consecutive, leading and trailing underscores do not produce empty pieces,
 * which matches the wcstok-based tokenization this function used before.
 * The string is scanned in place: no temporary copy is allocated and the
 * input is no longer echoed to wcerr (that echo was leftover debug output).
 * Scanning stops at the first NUL character, as it did with the C-string
 * copy used previously.
 *
 * @param s string to examine
 * @return number of non-empty underscore-separated pieces (0 for an empty
 *         string or a string made only of underscores)
 */
int tagger_utils::ntokens_multiword(wstring const &s)
{
  wchar_t const *delim = L"_";
  wchar_t const *cursor = s.c_str();
  int n = 0;

  for (;;) {
    cursor += wcsspn(cursor, delim);   // skip a run of delimiters
    if (*cursor == L'\0')
      break;
    ++n;
    cursor += wcscspn(cursor, delim);  // skip the piece just counted
  }

  return n;
}
88
/**
 * Counts the non-empty pieces of a string when it is split on L'-'.
 *
 * Consecutive, leading and trailing hyphens do not produce empty pieces,
 * which matches the wcstok-based tokenization this function used before.
 * The string is scanned in place: no temporary copy is allocated and the
 * input is no longer echoed to wcerr (that echo was leftover debug output).
 * Scanning stops at the first NUL character, as it did with the C-string
 * copy used previously.
 *
 * @param s string to examine
 * @return number of non-empty hyphen-separated pieces (0 for an empty
 *         string or a string made only of hyphens)
 */
int tagger_utils::nguiones_fs(wstring const & s) {
  wchar_t const *delim = L"-";
  wchar_t const *cursor = s.c_str();
  int n = 0;

  for (;;) {
    cursor += wcsspn(cursor, delim);   // skip a run of delimiters
    if (*cursor == L'\0')
      break;
    ++n;
    cursor += wcscspn(cursor, delim);  // skip the piece just counted
  }

  return n;
}
107
/**
 * Normalizes the blanks of a string.
 *
 * Every run of consecutive L' ' characters is collapsed to a single one,
 * and a blank at the very beginning or at the very end is removed. Only
 * L' ' counts as a blank; tabs, newlines and other whitespace are kept.
 *
 * Implemented as one left-to-right compaction pass, so it runs in linear
 * time and does not depend on unsigned index wraparound like the previous
 * erase-inside-the-loop version did. Results are the same.
 *
 * @param s string to normalize (taken by value and edited in place)
 * @return the normalized string; empty when s is empty or only blanks
 */
wstring tagger_utils::trim(wstring s)
{
  wstring::size_type kept = 0;
  // Starting in the "after a blank" state drops any leading blanks.
  bool after_blank = true;

  for (wstring::size_type i = 0; i < s.size(); ++i) {
    const bool is_blank = (s[i] == L' ');
    if (is_blank && after_blank)
      continue;
    s[kept++] = s[i];
    after_blank = is_blank;
  }

  // Runs were collapsed above, so at most one trailing blank is left.
  if (kept > 0 && s[kept - 1] == L' ')
    --kept;

  s.resize(kept);
  return s;
}
127
scan_for_ambg_classes(FILE * fdic,TaggerData & td)128 void tagger_utils::scan_for_ambg_classes(FILE *fdic, TaggerData &td) {
129 Collection &output = td.getOutput();
130 FileMorphoStream morpho_stream(fdic, true, &td);
131 tagger_utils::scan_for_ambg_classes(output, morpho_stream);
132 }
133
scan_for_ambg_classes(Collection & output,MorphoStream & morpho_stream)134 void tagger_utils::scan_for_ambg_classes(Collection &output, MorphoStream &morpho_stream) {
135 int nw = 0;
136 set <TTag> tags;
137 TaggerWord *word = NULL;
138
139 // In the input dictionary there must be all punctuation marks, including the end-of-sentence mark
140
141 word = morpho_stream.get_next_word();
142
143 while (word) {
144 if (++nw % 10000 == 0)
145 wcerr << L'.' << flush;
146
147 tags = word->get_tags();
148
149 if (tags.size() > 0)
150 output[tags];
151
152 delete word;
153 word = morpho_stream.get_next_word();
154 }
155 wcerr << L"\n";
156 }
157
158 void
add_neccesary_ambg_classes(TaggerData & td)159 tagger_utils::add_neccesary_ambg_classes(TaggerData &td) {
160 int i;
161 Collection &output = td.getOutput();
162
163 // OPEN AMBIGUITY CLASS
164 // It contains all tags that are not closed.
165 // Unknown words are assigned the open ambiguity class
166 output[td.getOpenClass()];
167
168 // Create ambiguity class holding one single tag for each tag.
169 // If not created yet
170 int N = (td.getTagIndex()).size();
171 for(i = 0; i != N; i++) {
172 set<TTag> amb_class;
173 amb_class.insert(i);
174 output[amb_class];
175 }
176 }
177
178 set<TTag> &
find_similar_ambiguity_class(TaggerData & td,set<TTag> & c)179 tagger_utils::find_similar_ambiguity_class(TaggerData &td, set<TTag> &c) {
180 set<TTag> &ret = td.getOpenClass();
181 Collection &output = td.getOutput();
182 int ret_idx = output[ret];
183
184 for (int k=0; k<output.size(); k++) {
185 const set<TTag> &ambg_class = output[k];
186 if (ambg_class.size() > ret.size() ||
187 (ambg_class.size() == ret.size())) {
188 continue;
189 }
190 if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) {
191 ret_idx = k;
192 ret = ambg_class;
193 }
194 }
195 return ret;
196 }
197
198 void
require_ambiguity_class(TaggerData & td,set<TTag> & tags,TaggerWord & word,int nw)199 tagger_utils::require_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, int nw) {
200 if (td.getOutput().has_not(tags)) {
201 wstring errors;
202 errors = L"A new ambiguity class was found. I cannot continue.\n";
203 errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n";
204 errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n";
205 if (nw >= 0) {
206 std::wostringstream ws;
207 ws << (nw + 1);
208 errors+= L"Line number: " + ws.str() + L"\n";
209 }
210 errors+= L"Take a look at the dictionary, then retrain.";
211 fatal_error(errors);
212 }
213 }
214
_warn_absent_ambiguity_class(TaggerWord & word)215 static void _warn_absent_ambiguity_class(TaggerWord &word) {
216 wstring errors;
217 errors = L"A new ambiguity class was found. \n";
218 errors += L"Retraining the tagger is necessary so as to take it into account.\n";
219 errors += L"Word '" + word.get_superficial_form() + L"'.\n";
220 errors += L"New ambiguity class: " + word.get_string_tags() + L"\n";
221 wcerr << L"Error: " << errors;
222 }
223
224 set<TTag> &
require_similar_ambiguity_class(TaggerData & td,set<TTag> & tags,TaggerWord & word,bool warn)225 tagger_utils::require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool warn) {
226 if (td.getOutput().has_not(tags)) {
227 if (warn) {
228 _warn_absent_ambiguity_class(word);
229 }
230 return find_similar_ambiguity_class(td, tags);
231 }
232 return tags;
233 }
234
235 set<TTag> &
require_similar_ambiguity_class(TaggerData & td,set<TTag> & tags)236 tagger_utils::require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags) {
237 if (td.getOutput().has_not(tags)) {
238 return find_similar_ambiguity_class(td, tags);
239 }
240 return tags;
241 }
242
243 void
warn_absent_ambiguity_class(TaggerData & td,set<TTag> & tags,TaggerWord & word,bool warn)244 tagger_utils::warn_absent_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool warn) {
245 if (warn && td.getOutput().has_not(tags)) {
246 _warn_absent_ambiguity_class(word);
247 }
248 }
249
/**
 * Writes a map with int keys as text: the number of entries, then for each
 * entry (in key order) a space, the key, a space and the value.
 *
 * @param os destination stream
 * @param f  map to write
 * @return os
 */
template <class T>
ostream& operator<< (ostream& os, const map <int, T> & f){
  os << f.size();
  typename map <int, T>::const_iterator entry = f.begin();
  while (entry != f.end()) {
    os << ' ' << entry->first << ' ' << entry->second;
    ++entry;
  }
  return os;
}
258
259 template <class T>
operator >>(istream & is,map<int,T> & f)260 istream& operator>> (istream& is, map <int, T> & f) {
261 int n, i, k;
262 f.clear();
263 is>>n;
264 for (k=0; k<n; k++) {
265 is>>i; // warning: does not work if both
266 is>>f[i]; // lines merged in a single one
267 }
268 if (is.bad()) tagger_utils::fatal_error(L"reading map");
269 return is;
270 }
271
/**
 * Writes a set as text: '{', the elements in iteration order separated by
 * ',', then '}'. An empty set is written as "{}".
 *
 * @param os destination stream
 * @param s  set to write
 * @return os
 */
template <class T>
ostream& operator<< (ostream& os, const set<T>& s) {
  os << '{';
  bool first = true;
  for (typename set<T>::const_iterator item = s.begin(); item != s.end(); ++item) {
    if (!first)
      os << ',';
    os << *item;
    first = false;
  }
  os << '}';
  return os;
}
283
284