1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #include "orcus/xml_namespace.hpp"
9 #include "orcus/exception.hpp"
10 #include "orcus/string_pool.hpp"
11 #include "orcus/global.hpp"
12 
13 #include <unordered_map>
14 #include <vector>
15 #include <limits>
16 #include <sstream>
17 #include <algorithm>
18 #include <cassert>
19 
20 #define ORCUS_DEBUG_XML_NAMESPACE 0
21 
22 using namespace std;
23 
24 #if ORCUS_DEBUG_XML_NAMESPACE
25 #include <cstdio>
26 #include <iostream>
27 #endif
28 
29 namespace orcus {
30 
31 namespace {
32 
33 #if ORCUS_DEBUG_XML_NAMESPACE
/**
 * Debug helper: write every key of a map-like container to standard output
 * in the form "keys: ('k1' 'k2' ...)".  No trailing newline is written.
 *
 * @param map_store container whose elements expose their key as `first`;
 *                  the key type must be streamable to std::ostream.
 */
template<typename MapType>
void print_map_keys(const MapType& map_store)
{
    std::cout << "keys: (";

    // Empty before the first key, a single space before every later one.
    const char* separator = "";
    for (const auto& entry : map_store)
    {
        std::cout << separator << "'" << entry.first << "'";
        separator = " ";
    }

    std::cout << ")";
}
50 #endif
51 
52 }
53 
54 typedef std::unordered_map<pstring, size_t, pstring::hash> strid_map_type;
55 
56 struct xmlns_repository::impl
57 {
58     size_t m_predefined_ns_size;
59     string_pool m_pool; /// storage of live string instances.
60     std::vector<pstring> m_identifiers; /// map strings to numerical identifiers.
61     strid_map_type m_strid_map; /// string-to-numerical identifiers map for quick lookup.
62 
implorcus::xmlns_repository::impl63     impl() : m_predefined_ns_size(0) {}
64 };
65 
xmlns_repository()66 xmlns_repository::xmlns_repository() : mp_impl(orcus::make_unique<impl>()) {}
~xmlns_repository()67 xmlns_repository::~xmlns_repository() {}
68 
intern(const pstring & uri)69 xmlns_id_t xmlns_repository::intern(const pstring& uri)
70 {
71     // See if the uri is already registered.
72     strid_map_type::iterator it = mp_impl->m_strid_map.find(uri);
73     if (it != mp_impl->m_strid_map.end())
74         return it->first.get();
75 
76     try
77     {
78         std::pair<pstring, bool> r = mp_impl->m_pool.intern(uri);
79         pstring uri_interned = r.first;
80         if (!uri_interned.empty())
81         {
82             // Intern successful.
83             if (r.second)
84             {
85                 // This is a new instance. Assign a numerical identifier.
86                 mp_impl->m_strid_map.insert(
87                     strid_map_type::value_type(r.first, mp_impl->m_identifiers.size()));
88 #if ORCUS_DEBUG_XML_NAMESPACE
89                 cout << "xmlns_repository::intern: uri='" << uri_interned << "' (" << mp_impl->m_identifiers.size() << ")" << endl;
90 #endif
91                 mp_impl->m_identifiers.push_back(r.first);
92 
93 #if ORCUS_DEBUG_XML_NAMESPACE
94                 cout << "pool size=" << mp_impl->m_pool.size() << ", predefined ns size=" << mp_impl->m_predefined_ns_size <<
95                     ", identifiers size=" << mp_impl->m_identifiers.size() << ", map size=" << mp_impl->m_strid_map.size() << endl;
96 #endif
97                 assert(mp_impl->m_pool.size()+mp_impl->m_predefined_ns_size == mp_impl->m_identifiers.size());
98                 assert(mp_impl->m_pool.size()+mp_impl->m_predefined_ns_size == mp_impl->m_strid_map.size());
99             }
100             return uri_interned.get();
101         }
102     }
103     catch (const general_error&)
104     {
105     }
106 
107     return XMLNS_UNKNOWN_ID;
108 }
109 
add_predefined_values(const xmlns_id_t * predefined_ns)110 void xmlns_repository::add_predefined_values(const xmlns_id_t* predefined_ns)
111 {
112     if (!predefined_ns)
113         return;
114 
115     const xmlns_id_t* val = &predefined_ns[0];
116     for (; *val; ++val)
117     {
118         pstring s(*val);
119         mp_impl->m_strid_map.insert(
120             strid_map_type::value_type(s, mp_impl->m_identifiers.size()));
121         mp_impl->m_identifiers.push_back(s);
122 
123         ++mp_impl->m_predefined_ns_size;
124 
125 #if ORCUS_DEBUG_XML_NAMESPACE
126         cout << "xlmns_repository: predefined ns='" << s << "'" << endl;
127 #endif
128     }
129 }
130 
create_context()131 xmlns_context xmlns_repository::create_context()
132 {
133     return xmlns_context(*this);
134 }
135 
get_identifier(size_t index) const136 xmlns_id_t xmlns_repository::get_identifier(size_t index) const
137 {
138     if (index >= mp_impl->m_identifiers.size())
139         return XMLNS_UNKNOWN_ID;
140 
141     // All identifier strings are interned which means they are all null-terminated.
142     return mp_impl->m_identifiers[index].get();
143 }
144 
get_short_name(xmlns_id_t ns_id) const145 string xmlns_repository::get_short_name(xmlns_id_t ns_id) const
146 {
147     size_t index = get_index(ns_id);
148     return get_short_name(index);
149 }
150 
get_short_name(size_t index) const151 string xmlns_repository::get_short_name(size_t index) const
152 {
153     if (index == index_not_found)
154         return string("???");
155 
156     ostringstream os;
157     os << "ns" << index;
158     return os.str();
159 }
160 
get_index(xmlns_id_t ns_id) const161 size_t xmlns_repository::get_index(xmlns_id_t ns_id) const
162 {
163     if (!ns_id)
164         return index_not_found;
165 
166     strid_map_type::const_iterator it = mp_impl->m_strid_map.find(pstring(ns_id));
167     if (it == mp_impl->m_strid_map.end())
168         return index_not_found;
169 
170     return it->second;
171 }
172 
173 typedef std::vector<xmlns_id_t> xmlns_list_type;
174 typedef std::unordered_map<pstring, xmlns_list_type, pstring::hash> alias_map_type;
175 
176 struct xmlns_context::impl
177 {
178     xmlns_repository& m_repo;
179     xmlns_list_type m_all_ns; /// all namespaces ever used in this context.
180     xmlns_list_type m_default;
181     alias_map_type m_map;
182 
183     bool m_trim_all_ns;
184 
implorcus::xmlns_context::impl185     impl(xmlns_repository& repo) : m_repo(repo), m_trim_all_ns(true) {}
implorcus::xmlns_context::impl186     impl(const impl& r) :
187         m_repo(r.m_repo), m_all_ns(r.m_all_ns), m_default(r.m_default), m_map(r.m_map), m_trim_all_ns(r.m_trim_all_ns) {}
188 };
189 
/// Create an empty context bound to the given repository.
xmlns_context::xmlns_context(xmlns_repository& repo) :
    mp_impl(orcus::make_unique<impl>(repo))
{
}

/// Copy a context by duplicating the other context's private state.
xmlns_context::xmlns_context(const xmlns_context& r) :
    mp_impl(orcus::make_unique<impl>(*r.mp_impl))
{
}

/// Nothing to release by hand; the members clean up after themselves.
xmlns_context::~xmlns_context() = default;
193 
push(const pstring & key,const pstring & uri)194 xmlns_id_t xmlns_context::push(const pstring& key, const pstring& uri)
195 {
196 #if ORCUS_DEBUG_XML_NAMESPACE
197     cout << "xmlns_context::push: key='" << key << "', uri='" << uri << "'" << endl;
198 #endif
199     mp_impl->m_trim_all_ns = true;
200 
201     pstring uri_interned = mp_impl->m_repo.intern(uri);
202 
203     if (key.empty())
204     {
205         // empty key value is associated with default namespace.
206         mp_impl->m_default.push_back(uri_interned.get());
207         mp_impl->m_all_ns.push_back(uri_interned.get());
208         return mp_impl->m_default.back();
209     }
210 
211     // See if this key already exists.
212     alias_map_type::iterator it = mp_impl->m_map.find(key);
213     if (it == mp_impl->m_map.end())
214     {
215         // This is the first time this key is used.
216         xmlns_list_type nslist;
217         nslist.push_back(uri_interned.get());
218         mp_impl->m_all_ns.push_back(uri_interned.get());
219         std::pair<alias_map_type::iterator,bool> r =
220             mp_impl->m_map.insert(alias_map_type::value_type(key, nslist));
221 
222         if (!r.second)
223             // insertion failed.
224             throw general_error("Failed to insert new namespace.");
225 
226         return nslist.back();
227     }
228 
229     // The key already exists.
230     xmlns_list_type& nslist = it->second;
231     nslist.push_back(uri_interned.get());
232     mp_impl->m_all_ns.push_back(uri_interned.get());
233     return nslist.back();
234 }
235 
pop(const pstring & key)236 void xmlns_context::pop(const pstring& key)
237 {
238 #if ORCUS_DEBUG_XML_NAMESPACE
239     cout << "xmlns_context::pop: key='" << key << "'" << endl;
240 #endif
241     if (key.empty())
242     {
243         // empty key value is associated with default namespace.
244         if (mp_impl->m_default.empty())
245             throw general_error("default namespace stack is empty.");
246 
247         mp_impl->m_default.pop_back();
248         return;
249     }
250 
251     // See if this key really exists.
252     alias_map_type::iterator it = mp_impl->m_map.find(key);
253     if (it == mp_impl->m_map.end())
254         throw general_error("failed to find the key.");
255 
256     xmlns_list_type& nslist = it->second;
257     if (nslist.empty())
258         throw general_error("namespace stack for this key is empty.");
259 
260     nslist.pop_back();
261 }
262 
get(const pstring & key) const263 xmlns_id_t xmlns_context::get(const pstring& key) const
264 {
265 #if ORCUS_DEBUG_XML_NAMESPACE
266     cout << "xmlns_context::get: alias='" << key << "', default ns stack size="
267         << mp_impl->m_default.size() << ", non-default alias count=" << mp_impl->m_map.size();
268     cout << ", ";
269     print_map_keys(mp_impl->m_map);
270     cout << endl;
271 #endif
272     if (key.empty())
273         return mp_impl->m_default.empty() ? XMLNS_UNKNOWN_ID : mp_impl->m_default.back();
274 
275     alias_map_type::const_iterator it = mp_impl->m_map.find(key);
276     if (it == mp_impl->m_map.end())
277     {
278 #if ORCUS_DEBUG_XML_NAMESPACE
279         cout << "xmlns_context::get: alias not in this context" << endl;
280 #endif
281         return XMLNS_UNKNOWN_ID;
282     }
283 
284 #if ORCUS_DEBUG_XML_NAMESPACE
285     cout << "xmlns_context::get: alias stack size=" << it->second.size() << endl;
286 #endif
287     return it->second.empty() ? XMLNS_UNKNOWN_ID : it->second.back();
288 }
289 
get_index(xmlns_id_t ns_id) const290 size_t xmlns_context::get_index(xmlns_id_t ns_id) const
291 {
292     return mp_impl->m_repo.get_index(ns_id);
293 }
294 
get_short_name(xmlns_id_t ns_id) const295 string xmlns_context::get_short_name(xmlns_id_t ns_id) const
296 {
297     return mp_impl->m_repo.get_short_name(ns_id);
298 }
299 
get_alias(xmlns_id_t ns_id) const300 pstring xmlns_context::get_alias(xmlns_id_t ns_id) const
301 {
302     alias_map_type::const_iterator it = mp_impl->m_map.begin(), it_end = mp_impl->m_map.end();
303     for (; it != it_end; ++it)
304     {
305         const xmlns_list_type& lst = it->second;
306         if (lst.empty())
307             continue;
308 
309         if (lst.back() == ns_id)
310             return it->first;
311     }
312 
313     return pstring();
314 }
315 
316 namespace {
317 
318 #if ORCUS_DEBUG_XML_NAMESPACE
319 struct print_ns : std::unary_function<xmlns_id_t, void>
320 {
operator ()orcus::__anon87eaaa1f0211::print_ns321     void operator() (xmlns_id_t ns_id) const
322     {
323         const char* p = ns_id;
324         printf("%p: %s\n", p, p);
325     }
326 };
327 #endif
328 
329 struct ns_item
330 {
331     size_t index;
332     xmlns_id_t ns;
333 
ns_itemorcus::__anon87eaaa1f0211::ns_item334     ns_item(size_t _index, xmlns_id_t _ns) : index(_index), ns(_ns) {}
335 };
336 
337 struct less_ns_by_index : binary_function<ns_item, ns_item, bool>
338 {
operator ()orcus::__anon87eaaa1f0211::less_ns_by_index339     bool operator() (const ns_item& left, const ns_item& right) const
340     {
341         return left.index < right.index;
342     }
343 };
344 
345 class push_back_ns_to_item : unary_function<xmlns_id_t, void>
346 {
347     vector<ns_item>& m_store;
348     const xmlns_context& m_cxt;
349 public:
push_back_ns_to_item(vector<ns_item> & store,const xmlns_context & cxt)350     push_back_ns_to_item(vector<ns_item>& store, const xmlns_context& cxt) : m_store(store), m_cxt(cxt) {}
operator ()(xmlns_id_t ns)351     void operator() (xmlns_id_t ns)
352     {
353         size_t num_id = m_cxt.get_index(ns);
354         if (num_id != index_not_found)
355             m_store.push_back(ns_item(num_id, ns));
356     }
357 };
358 
359 class push_back_item_to_ns : unary_function<ns_item, void>
360 {
361     std::vector<xmlns_id_t>& m_store;
362 public:
push_back_item_to_ns(std::vector<xmlns_id_t> & store)363     push_back_item_to_ns(std::vector<xmlns_id_t>& store) : m_store(store) {}
operator ()(const ns_item & item)364     void operator() (const ns_item& item)
365     {
366         m_store.push_back(item.ns);
367     }
368 };
369 
370 }
371 
get_all_namespaces() const372 std::vector<xmlns_id_t> xmlns_context::get_all_namespaces() const
373 {
374 #if ORCUS_DEBUG_XML_NAMESPACE
375     cout << "xmlns_context::get_all_namespaces: count=" << mp_impl->m_all_ns.size() << endl;
376     std::for_each(mp_impl->m_all_ns.begin(), mp_impl->m_all_ns.end(), print_ns());
377 #endif
378 
379     std::vector<xmlns_id_t> nslist;
380 
381     if (mp_impl->m_trim_all_ns)
382     {
383         xmlns_list_type& all_ns = mp_impl->m_all_ns;
384 
385         nslist.assign(mp_impl->m_all_ns.begin(), mp_impl->m_all_ns.end());
386 
387         // Sort it and remove duplicate.
388         std::sort(all_ns.begin(), all_ns.end());
389         xmlns_list_type::iterator it_unique_end =
390             std::unique(all_ns.begin(), all_ns.end());
391         all_ns.erase(it_unique_end, all_ns.end());
392 
393         // Now, sort by indices.
394         vector<ns_item> items;
395         std::for_each(all_ns.begin(), all_ns.end(), push_back_ns_to_item(items, *this));
396         std::sort(items.begin(), items.end(), less_ns_by_index());
397 
398         all_ns.clear();
399         std::for_each(items.begin(), items.end(), push_back_item_to_ns(all_ns));
400 
401         mp_impl->m_trim_all_ns = false;
402     }
403 
404     nslist.assign(mp_impl->m_all_ns.begin(), mp_impl->m_all_ns.end());
405     return nslist;
406 }
407 
dump(std::ostream & os) const408 void xmlns_context::dump(std::ostream& os) const
409 {
410     vector<xmlns_id_t> nslist = get_all_namespaces();
411     vector<xmlns_id_t>::const_iterator it = nslist.begin(), it_end = nslist.end();
412     for (; it != it_end; ++it)
413     {
414         xmlns_id_t ns_id = *it;
415         size_t num_id = get_index(ns_id);
416         if (num_id == index_not_found)
417             continue;
418 
419         os << "ns" << num_id << "=\"" << ns_id << '"' << endl;
420     }
421 }
422 
423 }
424 
425 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
426