1 /*
2 * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <apertium/transfer_data.h>
19 #include <lttoolbox/compression.h>
20 #include <apertium/utf_converter.h>
21 #include <apertium/apertium_re.h>
22 #include <iostream>
23 #include <apertium/string_utils.h>
24
25 using namespace Apertium;
26 using namespace std;
27
28 void
copy(TransferData const & o)29 TransferData::copy(TransferData const &o)
30 {
31 alphabet = o.alphabet;
32 transducer = o.transducer;
33 final_symbols = o.final_symbols;
34 seen_rules = o.seen_rules;
35 attr_items = o.attr_items;
36 macros = o.macros;
37 lists = o.lists;
38 variables = o.variables;
39 }
40
41 void
destroy()42 TransferData::destroy()
43 {
44 }
45
TransferData()46 TransferData::TransferData()
47 {
48 // adding fixed attr_items
49 attr_items[L"lem"] = L"^(([^<]|\"\\<\")+)";
50 attr_items[L"lemq"] = L"\\#[- _][^<]+";
51 attr_items[L"lemh"] = L"^(([^<#]|\"\\<\"|\"\\#\")+)";
52 attr_items[L"whole"] = L"(.+)";
53 attr_items[L"tags"] = L"((<[^>]+>)+)";
54 attr_items[L"chname"] = L"({([^/]+)\\/)"; // includes delimiters { and / !!!
55 attr_items[L"chcontent"] = L"(\\{.+)";
56 attr_items[L"content"] = L"(\\{.+)";
57 }
58
~TransferData()59 TransferData::~TransferData()
60 {
61 destroy();
62 }
63
TransferData(TransferData const & o)64 TransferData::TransferData(TransferData const &o)
65 {
66 copy(o);
67 }
68
69 TransferData &
operator =(TransferData const & o)70 TransferData::operator =(TransferData const &o)
71 {
72 if(this != &o)
73 {
74 destroy();
75 copy(o);
76 }
77 return *this;
78 }
79
80 Alphabet &
getAlphabet()81 TransferData::getAlphabet()
82 {
83 return alphabet;
84 }
85
86 Transducer &
getTransducer()87 TransferData::getTransducer()
88 {
89 return transducer;
90 }
91
92 map<wstring, wstring, Ltstr> &
getAttrItems()93 TransferData::getAttrItems()
94 {
95 return attr_items;
96 }
97
98 map<wstring, int, Ltstr> &
getMacros()99 TransferData::getMacros()
100 {
101 return macros;
102 }
103
104 map<wstring, set<wstring, Ltstr>, Ltstr> &
getLists()105 TransferData::getLists()
106 {
107 return lists;
108 }
109
110 map<wstring, wstring, Ltstr> &
getVariables()111 TransferData::getVariables()
112 {
113 return variables;
114 }
115
116 int
countToFinalSymbol(const int count)117 TransferData::countToFinalSymbol(const int count) {
118 const wstring count_sym = L"<RULE_NUMBER:" + to_wstring(count) + L">";
119 alphabet.includeSymbol(count_sym);
120 const int symbol = alphabet(count_sym);
121 final_symbols.insert(symbol);
122 return symbol;
123 }
124
125 void
write(FILE * output)126 TransferData::write(FILE *output)
127 {
128 alphabet.write(output);
129
130 transducer.minimize();
131 map<int, double> old_finals = transducer.getFinals(); // copy for later removal
132 map<int, int> finals_rules; // node id -> rule number
133 map<int, multimap<int, pair<int, double> > >& transitions = transducer.getTransitions();
134 // Find all arcs with "final_symbols" in the transitions, let their source node instead be final,
135 // and extract the rule number from the arc. Record relation between source node and rule number
136 // in finals_rules. It is now no longer safe to minimize -- but we already did that.
137 const wstring rule_sym_pre = L"<RULE_NUMBER:"; // see countToFinalSymbol()
138 for(map<int, multimap<int, pair<int, double> > >::const_iterator it = transitions.begin(),
139 limit = transitions.end(); it != limit; ++it)
140 {
141 const int src = it->first;
142 for(multimap<int, pair<int, double> >::const_iterator arc = it->second.begin(),
143 arclimit = it->second.end(); arc != arclimit; ++arc)
144 {
145 const int symbol = arc->first;
146 const int trg = arc->second.first;
147 const double wgt = arc->second.second;
148 if(final_symbols.count(symbol) == 0) {
149 continue;
150 }
151 if(!transducer.isFinal(trg)) {
152 continue;
153 }
154 // Extract the rule number encoded by countToFinalSymbol():
155 wstring s;
156 alphabet.getSymbol(s, symbol);
157 if(s.compare(0, rule_sym_pre.size(), rule_sym_pre) != 0) {
158 continue;
159 }
160 const int rule_num = stoi(s.substr(rule_sym_pre.size()));
161 transducer.setFinal(src, wgt);
162 finals_rules[src] = rule_num;
163 }
164 }
165 // Remove the old finals:
166 for(map<int, double>::const_iterator it = old_finals.begin(), limit = old_finals.end();
167 it != limit; ++it)
168 {
169 transducer.setFinal(it->first, it->second, false);
170 }
171
172 transducer.write(output, alphabet.size());
173
174 // finals_rules
175
176 Compression::multibyte_write(finals_rules.size(), output);
177 for(map<int, int>::const_iterator it = finals_rules.begin(), limit = finals_rules.end();
178 it != limit; it++)
179 {
180 Compression::multibyte_write(it->first, output);
181 Compression::multibyte_write(it->second, output);
182 }
183
184 // attr_items
185
186 // precompiled regexps
187 writeRegexps(output);
188
189 // variables
190 Compression::multibyte_write(variables.size(), output);
191 for(map<wstring, wstring, Ltstr>::const_iterator it = variables.begin(), limit = variables.end();
192 it != limit; it++)
193 {
194 Compression::wstring_write(it->first, output);
195 Compression::wstring_write(it->second, output);
196 }
197
198 // macros
199 Compression::multibyte_write(macros.size(), output);
200 for(map<wstring, int, Ltstr>::const_iterator it = macros.begin(), limit = macros.end();
201 it != limit; it++)
202 {
203 Compression::wstring_write(it->first, output);
204 Compression::multibyte_write(it->second, output);
205 }
206
207 // lists
208 Compression::multibyte_write(lists.size(), output);
209 for(map<wstring, set<wstring, Ltstr>, Ltstr>::const_iterator it = lists.begin(), limit = lists.end();
210 it != limit; it++)
211 {
212 Compression::wstring_write(it->first, output);
213 Compression::multibyte_write(it->second.size(), output);
214
215 for(set<wstring, Ltstr>::const_iterator it2 = it->second.begin(), limit2 = it->second.end();
216 it2 != limit2; it2++)
217 {
218 Compression::wstring_write(*it2, output);
219 }
220 }
221
222 }
223
224 void
writeRegexps(FILE * output)225 TransferData::writeRegexps(FILE *output)
226 {
227 Compression::string_write(string(pcre_version()), output);
228 Compression::multibyte_write(attr_items.size(), output);
229
230 map<wstring, wstring, Ltstr>::iterator it, limit;
231 for(it = attr_items.begin(), limit = attr_items.end(); it != limit; it++)
232 {
233 Compression::wstring_write(it->first, output);
234 ApertiumRE my_re;
235 my_re.compile(UtfConverter::toUtf8(it->second));
236 my_re.write(output);
237 Compression::wstring_write(it->second, output);
238 }
239 }
240