1 /**
2 * Licensed to the University Corporation for Advanced Internet
3 * Development, Inc. (UCAID) under one or more contributor license
4 * agreements. See the NOTICE file distributed with this work for
5 * additional information regarding copyright ownership.
6 *
7 * UCAID licenses this file to you under the Apache License,
8 * Version 2.0 (the "License"); you may not use this file except
9 * in compliance with the License. You may obtain a copy of the
10 * License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
17 * either express or implied. See the License for the specific
18 * language governing permissions and limitations under the License.
19 */
20
21 /**
22 * ParserPool.cpp
23 *
24 * A thread-safe pool of parsers that share characteristics.
25 */
26
27 #include "internal.h"
28 #include "exceptions.h"
29 #include "logging.h"
30 #include "util/CloneInputStream.h"
31 #include "util/CurlURLInputStream.h"
32 #include "util/NDC.h"
33 #include "util/PathResolver.h"
34 #include "util/ParserPool.h"
35 #include "util/Threads.h"
36 #include "util/XMLHelper.h"
37
38 #include <sys/types.h>
39 #include <sys/stat.h>
40 #include <algorithm>
41 #include <functional>
42 #include <boost/algorithm/string.hpp>
43 #define BOOST_BIND_GLOBAL_PLACEHOLDERS
44 #include <boost/bind.hpp>
45 #include <xercesc/util/PlatformUtils.hpp>
46 #include <xercesc/util/XMLUniDefs.hpp>
47 #include <xercesc/sax/SAXException.hpp>
48 #include <xercesc/framework/MemBufInputSource.hpp>
49 #include <xercesc/framework/LocalFileInputSource.hpp>
50 #include <xercesc/framework/Wrapper4InputSource.hpp>
51
52 using namespace xmltooling::logging;
53 using namespace xmltooling;
54 using namespace xercesc;
55 using namespace boost;
56 using namespace std;
57
58
59 namespace {
60 class MyErrorHandler : public DOMErrorHandler {
61 public:
62 unsigned int errors;
63
MyErrorHandler()64 MyErrorHandler() : errors(0) {}
65
handleError(const DOMError & e)66 bool handleError(const DOMError& e)
67 {
68 #ifdef _DEBUG
69 xmltooling::NDC ndc("handleError");
70 #endif
71 Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
72
73 DOMLocator* locator=e.getLocation();
74 auto_ptr_char temp(e.getMessage());
75
76 switch (e.getSeverity()) {
77 case DOMError::DOM_SEVERITY_WARNING:
78 log.warnStream() << "warning on line " << locator->getLineNumber()
79 << ", column " << locator->getColumnNumber()
80 << ", message: " << temp.get() << logging::eol;
81 return true;
82
83 case DOMError::DOM_SEVERITY_ERROR:
84 ++errors;
85 log.errorStream() << "error on line " << locator->getLineNumber()
86 << ", column " << locator->getColumnNumber()
87 << ", message: " << temp.get() << logging::eol;
88 return true;
89
90 case DOMError::DOM_SEVERITY_FATAL_ERROR:
91 ++errors;
92 log.errorStream() << "fatal error on line " << locator->getLineNumber()
93 << ", column " << locator->getColumnNumber()
94 << ", message: " << temp.get() << logging::eol;
95 return true;
96 }
97
98 ++errors;
99 log.errorStream() << "undefined error type on line " << locator->getLineNumber()
100 << ", column " << locator->getColumnNumber()
101 << ", message: " << temp.get() << logging::eol;
102 return false;
103 }
104 };
105 }
106
107
ParserPool(bool namespaceAware,bool schemaAware)108 ParserPool::ParserPool(bool namespaceAware, bool schemaAware)
109 : m_namespaceAware(namespaceAware), m_schemaAware(schemaAware), m_lock(Mutex::create()), m_security(new SecurityManager()) {
110
111 int expLimit = 0;
112 const char* env = getenv("XMLTOOLING_ENTITY_EXPANSION_LIMIT");
113 if (env) {
114 expLimit = atoi(env);
115 }
116 if (expLimit <= 0)
117 expLimit = XMLTOOLING_ENTITY_EXPANSION_LIMIT;
118 m_security->setEntityExpansionLimit(expLimit);
119 }
120
~ParserPool()121 ParserPool::~ParserPool()
122 {
123 while(!m_pool.empty()) {
124 m_pool.top()->release();
125 m_pool.pop();
126 }
127 }
128
newDocument()129 DOMDocument* ParserPool::newDocument()
130 {
131 return DOMImplementationRegistry::getDOMImplementation(nullptr)->createDocument();
132 }
133
parse(DOMLSInput & domsrc)134 DOMDocument* ParserPool::parse(DOMLSInput& domsrc)
135 {
136 DOMLSParser* parser=checkoutBuilder();
137 XercesJanitor<DOMLSParser> janitor(parser);
138 try {
139 MyErrorHandler deh;
140 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, dynamic_cast<DOMErrorHandler*>(&deh));
141 DOMDocument* doc=parser->parse(&domsrc);
142 if (deh.errors) {
143 if (doc)
144 doc->release();
145 throw XMLParserException("XML error(s) during parsing, check log for specifics");
146 }
147 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
148 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
149 checkinBuilder(janitor.release());
150 return doc;
151 }
152 catch (const DOMException& ex) {
153 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
154 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
155 checkinBuilder(janitor.release());
156 auto_ptr_char temp(ex.getMessage());
157 throw XMLParserException(string("DOM error during parsing: ") + (temp.get() ? temp.get() : "no message"));
158 }
159 catch (const SAXException& ex) {
160 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
161 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
162 checkinBuilder(janitor.release());
163 auto_ptr_char temp(ex.getMessage());
164 throw XMLParserException(string("SAX error during parsing: ") + (temp.get() ? temp.get() : "no message"));
165 }
166 catch (const XMLException& ex) {
167 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
168 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
169 checkinBuilder(janitor.release());
170 auto_ptr_char temp(ex.getMessage());
171 throw XMLParserException(string("Xerces error during parsing: ") + (temp.get() ? temp.get() : "no message"));
172 }
173 catch (const XMLToolingException&) {
174 parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
175 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
176 checkinBuilder(janitor.release());
177 throw;
178 }
179 }
180
parse(istream & is)181 DOMDocument* ParserPool::parse(istream& is)
182 {
183 StreamInputSource src(is);
184 Wrapper4InputSource domsrc(&src,false);
185 return parse(domsrc);
186 }
187
/**
 * Functor that appends the key of a map entry to a buffer twice, following
 * each copy with a separator, i.e. it appends "key<sep>key<sep>".
 *
 * The separator is stored by value so the functor stays valid even when it
 * is constructed from a temporary separator.
 */
template <class T> class doubleit {
public:
    /**
     * @param t buffer that receives the appended text (held by reference)
     * @param s separator appended after each copy of the key (copied)
     */
    doubleit(T& t, const typename T::value_type& s) : temp(t), sep(s) {}

    /** Appends the entry's key twice; the mapped value is not used. */
    void operator() (const std::pair<const T,T>& s) {
        temp += s.first;
        temp += sep;
        temp += s.first;
        temp += sep;
    }

    T& temp;
    const typename T::value_type sep;
};
196
loadSchema(const XMLCh * nsURI,const XMLCh * pathname)197 bool ParserPool::loadSchema(const XMLCh* nsURI, const XMLCh* pathname)
198 {
199 // Just check the pathname and then directly register the pair into the map.
200
201 auto_ptr_char p(pathname);
202 #ifdef WIN32
203 struct _stat stat_buf;
204 if (_stat(p.get(), &stat_buf) != 0)
205 #else
206 struct stat stat_buf;
207 if (stat(p.get(), &stat_buf) != 0)
208 #endif
209 {
210 #if _DEBUG
211 xmltooling::NDC ndc("loadSchema");
212 #endif
213 Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
214 auto_ptr_char n(nsURI);
215 log.error("failed to load schema for (%s), file not found (%s)",n.get(),p.get());
216 return false;
217 }
218
219 // Roundtrip to local code page and back to translate path as needed.
220 string topath(p.get());
221 XMLToolingConfig::getConfig().getPathResolver()->resolve(topath, PathResolver::XMLTOOLING_XML_FILE);
222 auto_ptr_XMLCh temp(topath.c_str());
223
224 Lock lock(m_lock);
225 m_schemaLocMap[nsURI] = temp.get();
226 m_schemaLocations.erase();
227 for_each(m_schemaLocMap.begin(), m_schemaLocMap.end(), doubleit<xstring>(m_schemaLocations,chSpace));
228
229 return true;
230 }
231
loadCatalogs(const char * pathnames)232 bool ParserPool::loadCatalogs(const char* pathnames)
233 {
234 string temp(pathnames);
235 trim(temp);
236 vector<string> catpaths;
237 split(catpaths, temp, is_any_of(PATH_SEPARATOR_STR), algorithm::token_compress_on);
238
239 for (vector<string>::const_iterator i = catpaths.begin(); i != catpaths.end(); ++i) {
240 loadCatalog(i->c_str());
241 }
242
243 return !catpaths.empty();
244 }
245
loadCatalog(const char * pathname)246 bool ParserPool::loadCatalog(const char* pathname)
247 {
248 string p(pathname);
249 XMLToolingConfig::getConfig().getPathResolver()->resolve(p, PathResolver::XMLTOOLING_XML_FILE);
250 auto_ptr_XMLCh temp(p.c_str());
251 return loadCatalog(temp.get());
252 }
253
loadCatalog(const XMLCh * pathname)254 bool ParserPool::loadCatalog(const XMLCh* pathname)
255 {
256 #if _DEBUG
257 xmltooling::NDC ndc("loadCatalog");
258 #endif
259 Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
260
261 // XML constants
262 static const XMLCh catalog[] = UNICODE_LITERAL_7(c,a,t,a,l,o,g);
263 static const XMLCh system[] = UNICODE_LITERAL_6(s,y,s,t,e,m);
264 static const XMLCh systemId[] = UNICODE_LITERAL_8(s,y,s,t,e,m,I,d);
265 static const XMLCh uri[] = UNICODE_LITERAL_3(u,r,i);
266 static const XMLCh CATALOG_NS[] = {
267 chLatin_u, chLatin_r, chLatin_n, chColon,
268 chLatin_o, chLatin_a, chLatin_s, chLatin_i, chLatin_s, chColon,
269 chLatin_n, chLatin_a, chLatin_m, chLatin_e, chLatin_s, chColon,
270 chLatin_t, chLatin_c, chColon,
271 chLatin_e, chLatin_n, chLatin_t, chLatin_i, chLatin_t, chLatin_y, chColon,
272 chLatin_x, chLatin_m, chLatin_l, chLatin_n, chLatin_s, chColon,
273 chLatin_x, chLatin_m, chLatin_l, chColon,
274 chLatin_c, chLatin_a, chLatin_t, chLatin_a, chLatin_l, chLatin_o, chLatin_g, chNull
275 };
276
277 // Parse the catalog with the internal parser pool.
278
279 if (log.isDebugEnabled()) {
280 auto_ptr_char temp(pathname);
281 log.debug("loading XML catalog from %s", temp.get());
282 }
283
284 LocalFileInputSource fsrc(nullptr,pathname);
285 Wrapper4InputSource domsrc(&fsrc,false);
286 try {
287 DOMDocument* doc=XMLToolingConfig::getConfig().getParser().parse(domsrc);
288 XercesJanitor<DOMDocument> janitor(doc);
289
290 // Check root element.
291 const DOMElement* root=doc->getDocumentElement();
292 if (!XMLHelper::isNodeNamed(root,CATALOG_NS,catalog)) {
293 auto_ptr_char temp(pathname);
294 log.error("unknown root element, failed to load XML catalog from %s", temp.get());
295 return false;
296 }
297
298 // Fetch all the <system> elements.
299 DOMNodeList* mappings = root->getElementsByTagNameNS(CATALOG_NS,system);
300 Lock lock(m_lock);
301 for (XMLSize_t i = 0; i < mappings->getLength(); i++) {
302 root = static_cast<DOMElement*>(mappings->item(i));
303 const XMLCh* from = root->getAttributeNS(nullptr,systemId);
304 const XMLCh* to = root->getAttributeNS(nullptr,uri);
305
306 // Roundtrip to local code page and back to translate path as needed.
307 auto_ptr_char temp(to);
308 string topath(temp.get());
309 XMLToolingConfig::getConfig().getPathResolver()->resolve(topath, PathResolver::XMLTOOLING_XML_FILE);
310 auto_ptr_XMLCh temp2(topath.c_str());
311
312 m_schemaLocMap[from] = temp2.get();
313 }
314 m_schemaLocations.erase();
315 for_each(m_schemaLocMap.begin(), m_schemaLocMap.end(), doubleit<xstring>(m_schemaLocations,chSpace));
316 }
317 catch (std::exception& e) {
318 log.error("catalog loader caught exception: %s", e.what());
319 return false;
320 }
321
322 return true;
323 }
324
resolveResource(const XMLCh * const resourceType,const XMLCh * const namespaceUri,const XMLCh * const publicId,const XMLCh * const systemId,const XMLCh * const baseURI)325 DOMLSInput* ParserPool::resolveResource(
326 const XMLCh *const resourceType,
327 const XMLCh *const namespaceUri,
328 const XMLCh *const publicId,
329 const XMLCh *const systemId,
330 const XMLCh *const baseURI
331 )
332 {
333 #if _DEBUG
334 xmltooling::NDC ndc("resolveEntity");
335 #endif
336 if (!systemId)
337 return nullptr;
338 xstring sysId(systemId);
339
340 Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
341 if (log.isDebugEnabled()) {
342 auto_ptr_char sysId(systemId);
343 auto_ptr_char base(baseURI);
344 log.debug("asked to resolve %s with baseURI %s",sysId.get(),base.get() ? base.get() : "(null)");
345 }
346
347 // Find well-known schemas in the specified location.
348 map<xstring,xstring>::const_iterator i = m_schemaLocMap.find(sysId);
349 if (i != m_schemaLocMap.end())
350 return new Wrapper4InputSource(new LocalFileInputSource(baseURI, i->second.c_str()));
351
352 // Check for entity as a suffix of a value in the map.
353 bool (*p_ends_with)(const xstring&, const xstring&) = ends_with;
354 i = find_if(
355 m_schemaLocMap.begin(), m_schemaLocMap.end(),
356 boost::bind(p_ends_with, boost::bind(&map<xstring,xstring>::value_type::second, _1), boost::ref(sysId))
357 );
358 if (i != m_schemaLocMap.end())
359 return new Wrapper4InputSource(new LocalFileInputSource(baseURI, i->second.c_str()));
360
361 // We'll allow anything without embedded slashes.
362 if (XMLString::indexOf(systemId, chForwardSlash) == -1 && XMLString::indexOf(systemId, chBackSlash) == -1)
363 return new Wrapper4InputSource(new LocalFileInputSource(baseURI, systemId));
364
365 // Shortcircuit the request.
366 auto_ptr_char temp(systemId);
367 log.debug("unauthorized entity request (%s), blocking it", temp.get());
368 static const XMLByte nullbuf[] = {0};
369 return new Wrapper4InputSource(new MemBufInputSource(nullbuf, 0, systemId));
370 }
371
createBuilder()372 DOMLSParser* ParserPool::createBuilder()
373 {
374 static const XMLCh impltype[] = { chLatin_L, chLatin_S, chNull };
375 DOMImplementation* impl=DOMImplementationRegistry::getDOMImplementation(impltype);
376 DOMLSParser* parser=static_cast<DOMImplementationLS*>(impl)->createLSParser(DOMImplementationLS::MODE_SYNCHRONOUS,nullptr);
377 parser->getDomConfig()->setParameter(XMLUni::fgDOMNamespaces, m_namespaceAware);
378 if (m_schemaAware) {
379 parser->getDomConfig()->setParameter(XMLUni::fgDOMNamespaces, true);
380 parser->getDomConfig()->setParameter(XMLUni::fgXercesSchema, true);
381 parser->getDomConfig()->setParameter(XMLUni::fgDOMValidate, true);
382 parser->getDomConfig()->setParameter(XMLUni::fgXercesCacheGrammarFromParse, true);
383
384 // We build a "fake" schema location hint that binds each namespace to itself.
385 // This ensures the entity resolver will be given the namespace as a systemId it can check.
386 parser->getDomConfig()->setParameter(XMLUni::fgXercesSchemaExternalSchemaLocation, const_cast<XMLCh*>(m_schemaLocations.c_str()));
387 }
388 parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
389 parser->getDomConfig()->setParameter(XMLUni::fgXercesDisableDefaultEntityResolution, true);
390 parser->getDomConfig()->setParameter(XMLUni::fgDOMDisallowDoctype, true);
391 parser->getDomConfig()->setParameter(XMLUni::fgDOMComments, false);
392 parser->getDomConfig()->setParameter(XMLUni::fgDOMResourceResolver, dynamic_cast<DOMLSResourceResolver*>(this));
393 parser->getDomConfig()->setParameter(XMLUni::fgXercesSecurityManager, m_security.get());
394 return parser;
395 }
396
checkoutBuilder()397 DOMLSParser* ParserPool::checkoutBuilder()
398 {
399 Lock lock(m_lock);
400 if (m_pool.empty()) {
401 DOMLSParser* builder=createBuilder();
402 return builder;
403 }
404 DOMLSParser* p=m_pool.top();
405 m_pool.pop();
406 if (m_schemaAware)
407 p->getDomConfig()->setParameter(XMLUni::fgXercesSchemaExternalSchemaLocation, const_cast<XMLCh*>(m_schemaLocations.c_str()));
408 return p;
409 }
410
checkinBuilder(DOMLSParser * builder)411 void ParserPool::checkinBuilder(DOMLSParser* builder)
412 {
413 if (builder) {
414 Lock lock(m_lock);
415 m_pool.push(builder);
416 }
417 }
418
StreamInputSource(istream & is,const char * systemId)419 StreamInputSource::StreamInputSource(istream& is, const char* systemId) : InputSource(systemId), m_is(is)
420 {
421 }
422
makeStream() const423 BinInputStream* StreamInputSource::makeStream() const
424 {
425 return new StreamBinInputStream(m_is);
426 }
427
StreamBinInputStream(istream & is)428 StreamInputSource::StreamBinInputStream::StreamBinInputStream(istream& is) : m_is(is), m_pos(0)
429 {
430 }
431
curPos() const432 XMLFilePos StreamInputSource::StreamBinInputStream::curPos() const
433 {
434 return m_pos;
435 }
436
getContentType() const437 const XMLCh* StreamInputSource::StreamBinInputStream::getContentType() const
438 {
439 return nullptr;
440 }
441
readBytes(XMLByte * const toFill,const XMLSize_t maxToRead)442 XMLSize_t StreamInputSource::StreamBinInputStream::readBytes(XMLByte* const toFill, const XMLSize_t maxToRead)
443 {
444 XMLByte* target=toFill;
445 XMLSize_t bytes_read=0,request=maxToRead;
446
447 // Fulfill the rest by reading from the stream.
448 if (request && !m_is.eof() && !m_is.fail()) {
449 try {
450 m_is.read(reinterpret_cast<char* const>(target),request);
451 m_pos+=m_is.gcount();
452 bytes_read+=m_is.gcount();
453 }
454 catch(ios_base::failure& e) {
455 Category::getInstance(XMLTOOLING_LOGCAT ".StreamInputSource").critStream()
456 << "XML::StreamInputSource::StreamBinInputStream::readBytes caught an exception: " << e.what()
457 << logging::eol;
458 *toFill=0;
459 return 0;
460 }
461 }
462 return bytes_read;
463 }
464
465 #ifdef XMLTOOLING_LITE
466
URLInputSource(const XMLCh * url,const char * systemId,string * cacheTag,std::string backingFile)467 URLInputSource::URLInputSource(const XMLCh* url, const char* systemId, string* cacheTag, std::string backingFile) : InputSource(systemId), m_backingFile(backingFile), m_url(url)
468 {
469 }
470
URLInputSource(const DOMElement * e,const char * systemId,string * cacheTag,std::string backingFile)471 URLInputSource::URLInputSource(const DOMElement* e, const char* systemId, string* cacheTag, std::string backingFile) : InputSource(systemId), m_backingFile(backingFile)
472 {
473 static const XMLCh uri[] = UNICODE_LITERAL_3(u,r,i);
474 static const XMLCh url[] = UNICODE_LITERAL_3(u,r,l);
475
476 const XMLCh* attr = e->getAttributeNS(nullptr, url);
477 if (!attr || !*attr) {
478 attr = e->getAttributeNS(nullptr, uri);
479 if (!attr || !*attr)
480 throw IOException("No URL supplied via DOM to URLInputSource constructor.");
481 }
482
483 m_url.setURL(attr);
484 }
485
makeStream() const486 BinInputStream* URLInputSource::makeStream() const
487 {
488 // Ask the URL to create us an appropriate input stream
489 return ("" == m_backingFile) ? m_url.makeNewStream() : new CloneInputStream(m_url.makeNewStream(), m_backingFile);
490 }
491
492 #else
493
URLInputSource(const XMLCh * url,const char * systemId,string * cacheTag,std::string backingFile)494 URLInputSource::URLInputSource(const XMLCh* url, const char* systemId, string* cacheTag, std::string backingFile)
495 : InputSource(systemId), m_backingFile(backingFile), m_cacheTag(cacheTag), m_url(url), m_root(nullptr)
496 {
497 }
498
URLInputSource(const DOMElement * e,const char * systemId,string * cacheTag,std::string backingFile)499 URLInputSource::URLInputSource(const DOMElement* e, const char* systemId, string* cacheTag, std::string backingFile)
500 : InputSource(systemId), m_backingFile(backingFile), m_cacheTag(cacheTag), m_root(e)
501 {
502 }
503
makeStream() const504 BinInputStream* URLInputSource::makeStream() const
505 {
506 BinInputStream* stream = m_root ? new CurlURLInputStream(m_root, m_cacheTag) : new CurlURLInputStream(m_url.get(), m_cacheTag);
507 return (m_backingFile.empty()) ? stream : new CloneInputStream(stream, m_backingFile);
508 }
509
510 #endif
511
512 const char URLInputSource::asciiStatusCodeElementName[] = "URLInputSourceStatus";
513
514 const XMLCh URLInputSource::utf16StatusCodeElementName[] = UNICODE_LITERAL_20(U,R,L,I,n,p,u,t,S,o,u,r,c,e,S,t,a,t,u,s);
515