1 /*
2  * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful, but
10  * WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <apertium/transfer_data.h>
19 #include <lttoolbox/compression.h>
20 #include <apertium/utf_converter.h>
21 #include <apertium/apertium_re.h>
22 #include <iostream>
23 #include <apertium/string_utils.h>
24 
25 using namespace Apertium;
26 using namespace std;
27 
28 void
copy(TransferData const & o)29 TransferData::copy(TransferData const &o)
30 {
31   alphabet = o.alphabet;
32   transducer = o.transducer;
33   final_symbols = o.final_symbols;
34   seen_rules = o.seen_rules;
35   attr_items = o.attr_items;
36   macros = o.macros;
37   lists = o.lists;
38   variables = o.variables;
39 }
40 
41 void
destroy()42 TransferData::destroy()
43 {
44 }
45 
TransferData()46 TransferData::TransferData()
47 {
48   // adding fixed attr_items
49   attr_items[L"lem"] = L"^(([^<]|\"\\<\")+)";
50   attr_items[L"lemq"] = L"\\#[- _][^<]+";
51   attr_items[L"lemh"] = L"^(([^<#]|\"\\<\"|\"\\#\")+)";
52   attr_items[L"whole"] = L"(.+)";
53   attr_items[L"tags"] = L"((<[^>]+>)+)";
54   attr_items[L"chname"] = L"({([^/]+)\\/)"; // includes delimiters { and / !!!
55   attr_items[L"chcontent"] = L"(\\{.+)";
56   attr_items[L"content"] = L"(\\{.+)";
57 }
58 
~TransferData()59 TransferData::~TransferData()
60 {
61   destroy();
62 }
63 
TransferData(TransferData const & o)64 TransferData::TransferData(TransferData const &o)
65 {
66   copy(o);
67 }
68 
69 TransferData &
operator =(TransferData const & o)70 TransferData::operator =(TransferData const &o)
71 {
72   if(this != &o)
73   {
74     destroy();
75     copy(o);
76   }
77   return *this;
78 }
79 
80 Alphabet &
getAlphabet()81 TransferData::getAlphabet()
82 {
83   return alphabet;
84 }
85 
86 Transducer &
getTransducer()87 TransferData::getTransducer()
88 {
89   return transducer;
90 }
91 
92 map<wstring, wstring, Ltstr> &
getAttrItems()93 TransferData::getAttrItems()
94 {
95   return attr_items;
96 }
97 
98 map<wstring, int, Ltstr> &
getMacros()99 TransferData::getMacros()
100 {
101   return macros;
102 }
103 
104 map<wstring, set<wstring, Ltstr>, Ltstr> &
getLists()105 TransferData::getLists()
106 {
107   return lists;
108 }
109 
110 map<wstring, wstring, Ltstr> &
getVariables()111 TransferData::getVariables()
112 {
113   return variables;
114 }
115 
116 int
countToFinalSymbol(const int count)117 TransferData::countToFinalSymbol(const int count) {
118   const wstring count_sym = L"<RULE_NUMBER:" + to_wstring(count) + L">";
119   alphabet.includeSymbol(count_sym);
120   const int symbol = alphabet(count_sym);
121   final_symbols.insert(symbol);
122   return symbol;
123 }
124 
125 void
write(FILE * output)126 TransferData::write(FILE *output)
127 {
128   alphabet.write(output);
129 
130   transducer.minimize();
131   map<int, double> old_finals = transducer.getFinals(); // copy for later removal
132   map<int, int> finals_rules;                   // node id -> rule number
133   map<int, multimap<int, pair<int, double> > >& transitions = transducer.getTransitions();
134   // Find all arcs with "final_symbols" in the transitions, let their source node instead be final,
135   // and extract the rule number from the arc. Record relation between source node and rule number
136   // in finals_rules. It is now no longer safe to minimize -- but we already did that.
137   const wstring rule_sym_pre = L"<RULE_NUMBER:"; // see countToFinalSymbol()
138   for(map<int, multimap<int, pair<int, double> > >::const_iterator it = transitions.begin(),
139         limit = transitions.end(); it != limit; ++it)
140   {
141     const int src = it->first;
142     for(multimap<int, pair<int, double> >::const_iterator arc = it->second.begin(),
143           arclimit = it->second.end(); arc != arclimit; ++arc)
144     {
145       const int symbol = arc->first;
146       const int trg = arc->second.first;
147       const double wgt = arc->second.second;
148       if(final_symbols.count(symbol) == 0) {
149         continue;
150       }
151       if(!transducer.isFinal(trg)) {
152         continue;
153       }
154       // Extract the rule number encoded by countToFinalSymbol():
155       wstring s;
156       alphabet.getSymbol(s, symbol);
157       if(s.compare(0, rule_sym_pre.size(), rule_sym_pre) != 0) {
158         continue;
159       }
160       const int rule_num = stoi(s.substr(rule_sym_pre.size()));
161       transducer.setFinal(src, wgt);
162       finals_rules[src] = rule_num;
163     }
164   }
165   // Remove the old finals:
166   for(map<int, double>::const_iterator it = old_finals.begin(), limit = old_finals.end();
167       it != limit; ++it)
168   {
169     transducer.setFinal(it->first, it->second, false);
170   }
171 
172   transducer.write(output, alphabet.size());
173 
174   // finals_rules
175 
176   Compression::multibyte_write(finals_rules.size(), output);
177   for(map<int, int>::const_iterator it = finals_rules.begin(), limit = finals_rules.end();
178       it != limit; it++)
179   {
180     Compression::multibyte_write(it->first, output);
181     Compression::multibyte_write(it->second, output);
182   }
183 
184   // attr_items
185 
186   // precompiled regexps
187   writeRegexps(output);
188 
189   // variables
190   Compression::multibyte_write(variables.size(), output);
191   for(map<wstring, wstring, Ltstr>::const_iterator it = variables.begin(), limit = variables.end();
192       it != limit; it++)
193   {
194     Compression::wstring_write(it->first, output);
195     Compression::wstring_write(it->second, output);
196   }
197 
198   // macros
199   Compression::multibyte_write(macros.size(), output);
200   for(map<wstring, int, Ltstr>::const_iterator it = macros.begin(), limit = macros.end();
201       it != limit; it++)
202   {
203     Compression::wstring_write(it->first, output);
204     Compression::multibyte_write(it->second, output);
205   }
206 
207   // lists
208   Compression::multibyte_write(lists.size(), output);
209   for(map<wstring, set<wstring, Ltstr>, Ltstr>::const_iterator it = lists.begin(), limit = lists.end();
210       it != limit; it++)
211   {
212     Compression::wstring_write(it->first, output);
213     Compression::multibyte_write(it->second.size(), output);
214 
215     for(set<wstring, Ltstr>::const_iterator it2 = it->second.begin(), limit2 = it->second.end();
216 	it2 != limit2; it2++)
217     {
218       Compression::wstring_write(*it2, output);
219     }
220   }
221 
222 }
223 
224 void
writeRegexps(FILE * output)225 TransferData::writeRegexps(FILE *output)
226 {
227   Compression::string_write(string(pcre_version()), output);
228   Compression::multibyte_write(attr_items.size(), output);
229 
230   map<wstring, wstring, Ltstr>::iterator it, limit;
231   for(it = attr_items.begin(), limit = attr_items.end(); it != limit; it++)
232   {
233     Compression::wstring_write(it->first, output);
234     ApertiumRE my_re;
235     my_re.compile(UtfConverter::toUtf8(it->second));
236     my_re.write(output);
237     Compression::wstring_write(it->second, output);
238   }
239 }
240