1 /**
2  * Licensed to the University Corporation for Advanced Internet
3  * Development, Inc. (UCAID) under one or more contributor license
4  * agreements. See the NOTICE file distributed with this work for
5  * additional information regarding copyright ownership.
6  *
7  * UCAID licenses this file to you under the Apache License,
8  * Version 2.0 (the "License"); you may not use this file except
9  * in compliance with the License. You may obtain a copy of the
10  * License at
11  *
12  * http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing,
15  * software distributed under the License is distributed on an
16  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
17  * either express or implied. See the License for the specific
18  * language governing permissions and limitations under the License.
19  */
20 
21 /**
22  * ParserPool.cpp
23  *
24  * A thread-safe pool of parsers that share characteristics.
25  */
26 
27 #include "internal.h"
28 #include "exceptions.h"
29 #include "logging.h"
30 #include "util/CloneInputStream.h"
31 #include "util/CurlURLInputStream.h"
32 #include "util/NDC.h"
33 #include "util/PathResolver.h"
34 #include "util/ParserPool.h"
35 #include "util/Threads.h"
36 #include "util/XMLHelper.h"
37 
38 #include <sys/types.h>
39 #include <sys/stat.h>
40 #include <algorithm>
41 #include <functional>
42 #include <boost/algorithm/string.hpp>
43 #define BOOST_BIND_GLOBAL_PLACEHOLDERS
44 #include <boost/bind.hpp>
45 #include <xercesc/util/PlatformUtils.hpp>
46 #include <xercesc/util/XMLUniDefs.hpp>
47 #include <xercesc/sax/SAXException.hpp>
48 #include <xercesc/framework/MemBufInputSource.hpp>
49 #include <xercesc/framework/LocalFileInputSource.hpp>
50 #include <xercesc/framework/Wrapper4InputSource.hpp>
51 
52 using namespace xmltooling::logging;
53 using namespace xmltooling;
54 using namespace xercesc;
55 using namespace boost;
56 using namespace std;
57 
58 
59 namespace {
60     class MyErrorHandler : public DOMErrorHandler {
61     public:
62         unsigned int errors;
63 
MyErrorHandler()64         MyErrorHandler() : errors(0) {}
65 
handleError(const DOMError & e)66         bool handleError(const DOMError& e)
67         {
68 #ifdef _DEBUG
69             xmltooling::NDC ndc("handleError");
70 #endif
71             Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
72 
73             DOMLocator* locator=e.getLocation();
74             auto_ptr_char temp(e.getMessage());
75 
76             switch (e.getSeverity()) {
77                 case DOMError::DOM_SEVERITY_WARNING:
78                     log.warnStream() << "warning on line " << locator->getLineNumber()
79                         << ", column " << locator->getColumnNumber()
80                         << ", message: " << temp.get() << logging::eol;
81                     return true;
82 
83                 case DOMError::DOM_SEVERITY_ERROR:
84                     ++errors;
85                     log.errorStream() << "error on line " << locator->getLineNumber()
86                         << ", column " << locator->getColumnNumber()
87                         << ", message: " << temp.get() << logging::eol;
88                     return true;
89 
90                 case DOMError::DOM_SEVERITY_FATAL_ERROR:
91                     ++errors;
92                     log.errorStream() << "fatal error on line " << locator->getLineNumber()
93                         << ", column " << locator->getColumnNumber()
94                         << ", message: " << temp.get() << logging::eol;
95                     return true;
96             }
97 
98             ++errors;
99             log.errorStream() << "undefined error type on line " << locator->getLineNumber()
100                 << ", column " << locator->getColumnNumber()
101                 << ", message: " << temp.get() << logging::eol;
102             return false;
103         }
104     };
105 }
106 
107 
ParserPool(bool namespaceAware,bool schemaAware)108 ParserPool::ParserPool(bool namespaceAware, bool schemaAware)
109         : m_namespaceAware(namespaceAware), m_schemaAware(schemaAware), m_lock(Mutex::create()), m_security(new SecurityManager()) {
110 
111     int expLimit = 0;
112     const char* env = getenv("XMLTOOLING_ENTITY_EXPANSION_LIMIT");
113     if (env) {
114         expLimit = atoi(env);
115     }
116     if (expLimit <= 0)
117         expLimit = XMLTOOLING_ENTITY_EXPANSION_LIMIT;
118     m_security->setEntityExpansionLimit(expLimit);
119 }
120 
~ParserPool()121 ParserPool::~ParserPool()
122 {
123     while(!m_pool.empty()) {
124         m_pool.top()->release();
125         m_pool.pop();
126     }
127 }
128 
newDocument()129 DOMDocument* ParserPool::newDocument()
130 {
131     return DOMImplementationRegistry::getDOMImplementation(nullptr)->createDocument();
132 }
133 
parse(DOMLSInput & domsrc)134 DOMDocument* ParserPool::parse(DOMLSInput& domsrc)
135 {
136     DOMLSParser* parser=checkoutBuilder();
137     XercesJanitor<DOMLSParser> janitor(parser);
138     try {
139         MyErrorHandler deh;
140         parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, dynamic_cast<DOMErrorHandler*>(&deh));
141         DOMDocument* doc=parser->parse(&domsrc);
142         if (deh.errors) {
143             if (doc)
144                 doc->release();
145             throw XMLParserException("XML error(s) during parsing, check log for specifics");
146         }
147         parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
148         parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
149         checkinBuilder(janitor.release());
150         return doc;
151     }
152     catch (const DOMException& ex) {
153         parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
154         parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
155         checkinBuilder(janitor.release());
156         auto_ptr_char temp(ex.getMessage());
157         throw XMLParserException(string("DOM error during parsing: ") + (temp.get() ? temp.get() : "no message"));
158     }
159     catch (const SAXException& ex) {
160         parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
161         parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
162         checkinBuilder(janitor.release());
163         auto_ptr_char temp(ex.getMessage());
164         throw XMLParserException(string("SAX error during parsing: ") + (temp.get() ? temp.get() : "no message"));
165     }
166     catch (const XMLException& ex) {
167         parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
168         parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
169         checkinBuilder(janitor.release());
170         auto_ptr_char temp(ex.getMessage());
171         throw XMLParserException(string("Xerces error during parsing: ") + (temp.get() ? temp.get() : "no message"));
172     }
173     catch (const XMLToolingException&) {
174         parser->getDomConfig()->setParameter(XMLUni::fgDOMErrorHandler, (void*)nullptr);
175         parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
176         checkinBuilder(janitor.release());
177         throw;
178     }
179 }
180 
parse(istream & is)181 DOMDocument* ParserPool::parse(istream& is)
182 {
183     StreamInputSource src(is);
184     Wrapper4InputSource domsrc(&src,false);
185     return parse(domsrc);
186 }
187 
// Functor that, for each map entry, appends the entry's key twice to a buffer,
// following each copy with a separator (key, sep, key, sep). The mapped value
// is not used.
//
// The separator is stored by value: holding it by reference would leave the
// functor with a dangling reference whenever it is built from a temporary
// separator and used in a later statement.
template <class T> class doubleit {
public:
    // t is the buffer appended to (held by reference); s is the separator (copied).
    doubleit(T& t, const typename T::value_type& s) : temp(t), sep(s) {}

    // Appends "<key><sep><key><sep>" for the given entry, in place.
    void operator() (const std::pair<const T,T>& s) {
        temp += s.first;
        temp += sep;
        temp += s.first;
        temp += sep;
    }

    T& temp;
    const typename T::value_type sep;
};
196 
loadSchema(const XMLCh * nsURI,const XMLCh * pathname)197 bool ParserPool::loadSchema(const XMLCh* nsURI, const XMLCh* pathname)
198 {
199     // Just check the pathname and then directly register the pair into the map.
200 
201     auto_ptr_char p(pathname);
202 #ifdef WIN32
203     struct _stat stat_buf;
204     if (_stat(p.get(), &stat_buf) != 0)
205 #else
206     struct stat stat_buf;
207     if (stat(p.get(), &stat_buf) != 0)
208 #endif
209     {
210 #if _DEBUG
211         xmltooling::NDC ndc("loadSchema");
212 #endif
213         Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
214         auto_ptr_char n(nsURI);
215         log.error("failed to load schema for (%s), file not found (%s)",n.get(),p.get());
216         return false;
217     }
218 
219     // Roundtrip to local code page and back to translate path as needed.
220     string topath(p.get());
221     XMLToolingConfig::getConfig().getPathResolver()->resolve(topath, PathResolver::XMLTOOLING_XML_FILE);
222     auto_ptr_XMLCh temp(topath.c_str());
223 
224     Lock lock(m_lock);
225     m_schemaLocMap[nsURI] = temp.get();
226     m_schemaLocations.erase();
227     for_each(m_schemaLocMap.begin(), m_schemaLocMap.end(), doubleit<xstring>(m_schemaLocations,chSpace));
228 
229     return true;
230 }
231 
loadCatalogs(const char * pathnames)232 bool ParserPool::loadCatalogs(const char* pathnames)
233 {
234     string temp(pathnames);
235     trim(temp);
236     vector<string> catpaths;
237     split(catpaths, temp, is_any_of(PATH_SEPARATOR_STR), algorithm::token_compress_on);
238 
239     for (vector<string>::const_iterator i = catpaths.begin(); i != catpaths.end(); ++i) {
240         loadCatalog(i->c_str());
241     }
242 
243     return !catpaths.empty();
244 }
245 
loadCatalog(const char * pathname)246 bool ParserPool::loadCatalog(const char* pathname)
247 {
248     string p(pathname);
249     XMLToolingConfig::getConfig().getPathResolver()->resolve(p, PathResolver::XMLTOOLING_XML_FILE);
250     auto_ptr_XMLCh temp(p.c_str());
251     return loadCatalog(temp.get());
252 }
253 
loadCatalog(const XMLCh * pathname)254 bool ParserPool::loadCatalog(const XMLCh* pathname)
255 {
256 #if _DEBUG
257     xmltooling::NDC ndc("loadCatalog");
258 #endif
259     Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
260 
261     // XML constants
262     static const XMLCh catalog[] =  UNICODE_LITERAL_7(c,a,t,a,l,o,g);
263     static const XMLCh system[] =   UNICODE_LITERAL_6(s,y,s,t,e,m);
264     static const XMLCh systemId[] = UNICODE_LITERAL_8(s,y,s,t,e,m,I,d);
265     static const XMLCh uri[] =      UNICODE_LITERAL_3(u,r,i);
266     static const XMLCh CATALOG_NS[] = {
267         chLatin_u, chLatin_r, chLatin_n, chColon,
268         chLatin_o, chLatin_a, chLatin_s, chLatin_i, chLatin_s, chColon,
269         chLatin_n, chLatin_a, chLatin_m, chLatin_e, chLatin_s, chColon,
270         chLatin_t, chLatin_c, chColon,
271         chLatin_e, chLatin_n, chLatin_t, chLatin_i, chLatin_t, chLatin_y, chColon,
272         chLatin_x, chLatin_m, chLatin_l, chLatin_n, chLatin_s, chColon,
273         chLatin_x, chLatin_m, chLatin_l, chColon,
274         chLatin_c, chLatin_a, chLatin_t, chLatin_a, chLatin_l, chLatin_o, chLatin_g, chNull
275     };
276 
277     // Parse the catalog with the internal parser pool.
278 
279     if (log.isDebugEnabled()) {
280         auto_ptr_char temp(pathname);
281         log.debug("loading XML catalog from %s", temp.get());
282     }
283 
284     LocalFileInputSource fsrc(nullptr,pathname);
285     Wrapper4InputSource domsrc(&fsrc,false);
286     try {
287         DOMDocument* doc=XMLToolingConfig::getConfig().getParser().parse(domsrc);
288         XercesJanitor<DOMDocument> janitor(doc);
289 
290         // Check root element.
291         const DOMElement* root=doc->getDocumentElement();
292         if (!XMLHelper::isNodeNamed(root,CATALOG_NS,catalog)) {
293             auto_ptr_char temp(pathname);
294             log.error("unknown root element, failed to load XML catalog from %s", temp.get());
295             return false;
296         }
297 
298         // Fetch all the <system> elements.
299         DOMNodeList* mappings = root->getElementsByTagNameNS(CATALOG_NS,system);
300         Lock lock(m_lock);
301         for (XMLSize_t i = 0; i < mappings->getLength(); i++) {
302             root = static_cast<DOMElement*>(mappings->item(i));
303             const XMLCh* from = root->getAttributeNS(nullptr,systemId);
304             const XMLCh* to = root->getAttributeNS(nullptr,uri);
305 
306             // Roundtrip to local code page and back to translate path as needed.
307             auto_ptr_char temp(to);
308             string topath(temp.get());
309             XMLToolingConfig::getConfig().getPathResolver()->resolve(topath, PathResolver::XMLTOOLING_XML_FILE);
310             auto_ptr_XMLCh temp2(topath.c_str());
311 
312             m_schemaLocMap[from] = temp2.get();
313         }
314         m_schemaLocations.erase();
315         for_each(m_schemaLocMap.begin(), m_schemaLocMap.end(), doubleit<xstring>(m_schemaLocations,chSpace));
316     }
317     catch (std::exception& e) {
318         log.error("catalog loader caught exception: %s", e.what());
319         return false;
320     }
321 
322     return true;
323 }
324 
resolveResource(const XMLCh * const resourceType,const XMLCh * const namespaceUri,const XMLCh * const publicId,const XMLCh * const systemId,const XMLCh * const baseURI)325 DOMLSInput* ParserPool::resolveResource(
326             const XMLCh *const resourceType,
327             const XMLCh *const namespaceUri,
328             const XMLCh *const publicId,
329             const XMLCh *const systemId,
330             const XMLCh *const baseURI
331             )
332 {
333 #if _DEBUG
334     xmltooling::NDC ndc("resolveEntity");
335 #endif
336     if (!systemId)
337         return nullptr;
338     xstring sysId(systemId);
339 
340     Category& log=Category::getInstance(XMLTOOLING_LOGCAT ".ParserPool");
341     if (log.isDebugEnabled()) {
342         auto_ptr_char sysId(systemId);
343         auto_ptr_char base(baseURI);
344         log.debug("asked to resolve %s with baseURI %s",sysId.get(),base.get() ? base.get() : "(null)");
345     }
346 
347     // Find well-known schemas in the specified location.
348     map<xstring,xstring>::const_iterator i = m_schemaLocMap.find(sysId);
349     if (i != m_schemaLocMap.end())
350         return new Wrapper4InputSource(new LocalFileInputSource(baseURI, i->second.c_str()));
351 
352     // Check for entity as a suffix of a value in the map.
353     bool (*p_ends_with)(const xstring&, const xstring&) = ends_with;
354     i = find_if(
355         m_schemaLocMap.begin(), m_schemaLocMap.end(),
356         boost::bind(p_ends_with, boost::bind(&map<xstring,xstring>::value_type::second, _1), boost::ref(sysId))
357         );
358     if (i != m_schemaLocMap.end())
359         return new Wrapper4InputSource(new LocalFileInputSource(baseURI, i->second.c_str()));
360 
361     // We'll allow anything without embedded slashes.
362     if (XMLString::indexOf(systemId, chForwardSlash) == -1 && XMLString::indexOf(systemId, chBackSlash) == -1)
363         return new Wrapper4InputSource(new LocalFileInputSource(baseURI, systemId));
364 
365     // Shortcircuit the request.
366     auto_ptr_char temp(systemId);
367     log.debug("unauthorized entity request (%s), blocking it", temp.get());
368     static const XMLByte nullbuf[] = {0};
369     return new Wrapper4InputSource(new MemBufInputSource(nullbuf, 0, systemId));
370 }
371 
createBuilder()372 DOMLSParser* ParserPool::createBuilder()
373 {
374     static const XMLCh impltype[] = { chLatin_L, chLatin_S, chNull };
375     DOMImplementation* impl=DOMImplementationRegistry::getDOMImplementation(impltype);
376     DOMLSParser* parser=static_cast<DOMImplementationLS*>(impl)->createLSParser(DOMImplementationLS::MODE_SYNCHRONOUS,nullptr);
377     parser->getDomConfig()->setParameter(XMLUni::fgDOMNamespaces, m_namespaceAware);
378     if (m_schemaAware) {
379         parser->getDomConfig()->setParameter(XMLUni::fgDOMNamespaces, true);
380         parser->getDomConfig()->setParameter(XMLUni::fgXercesSchema, true);
381         parser->getDomConfig()->setParameter(XMLUni::fgDOMValidate, true);
382         parser->getDomConfig()->setParameter(XMLUni::fgXercesCacheGrammarFromParse, true);
383 
384         // We build a "fake" schema location hint that binds each namespace to itself.
385         // This ensures the entity resolver will be given the namespace as a systemId it can check.
386         parser->getDomConfig()->setParameter(XMLUni::fgXercesSchemaExternalSchemaLocation, const_cast<XMLCh*>(m_schemaLocations.c_str()));
387     }
388     parser->getDomConfig()->setParameter(XMLUni::fgXercesUserAdoptsDOMDocument, true);
389     parser->getDomConfig()->setParameter(XMLUni::fgXercesDisableDefaultEntityResolution, true);
390     parser->getDomConfig()->setParameter(XMLUni::fgDOMDisallowDoctype, true);
391     parser->getDomConfig()->setParameter(XMLUni::fgDOMComments, false);
392     parser->getDomConfig()->setParameter(XMLUni::fgDOMResourceResolver, dynamic_cast<DOMLSResourceResolver*>(this));
393     parser->getDomConfig()->setParameter(XMLUni::fgXercesSecurityManager, m_security.get());
394     return parser;
395 }
396 
checkoutBuilder()397 DOMLSParser* ParserPool::checkoutBuilder()
398 {
399     Lock lock(m_lock);
400     if (m_pool.empty()) {
401         DOMLSParser* builder=createBuilder();
402         return builder;
403     }
404     DOMLSParser* p=m_pool.top();
405     m_pool.pop();
406     if (m_schemaAware)
407         p->getDomConfig()->setParameter(XMLUni::fgXercesSchemaExternalSchemaLocation, const_cast<XMLCh*>(m_schemaLocations.c_str()));
408     return p;
409 }
410 
checkinBuilder(DOMLSParser * builder)411 void ParserPool::checkinBuilder(DOMLSParser* builder)
412 {
413     if (builder) {
414         Lock lock(m_lock);
415         m_pool.push(builder);
416     }
417 }
418 
StreamInputSource(istream & is,const char * systemId)419 StreamInputSource::StreamInputSource(istream& is, const char* systemId) : InputSource(systemId), m_is(is)
420 {
421 }
422 
makeStream() const423 BinInputStream* StreamInputSource::makeStream() const
424 {
425     return new StreamBinInputStream(m_is);
426 }
427 
StreamBinInputStream(istream & is)428 StreamInputSource::StreamBinInputStream::StreamBinInputStream(istream& is) : m_is(is), m_pos(0)
429 {
430 }
431 
curPos() const432 XMLFilePos StreamInputSource::StreamBinInputStream::curPos() const
433 {
434     return m_pos;
435 }
436 
getContentType() const437 const XMLCh* StreamInputSource::StreamBinInputStream::getContentType() const
438 {
439     return nullptr;
440 }
441 
readBytes(XMLByte * const toFill,const XMLSize_t maxToRead)442 XMLSize_t StreamInputSource::StreamBinInputStream::readBytes(XMLByte* const toFill, const XMLSize_t maxToRead)
443 {
444     XMLByte* target=toFill;
445     XMLSize_t bytes_read=0,request=maxToRead;
446 
447     // Fulfill the rest by reading from the stream.
448     if (request && !m_is.eof() && !m_is.fail()) {
449         try {
450             m_is.read(reinterpret_cast<char* const>(target),request);
451             m_pos+=m_is.gcount();
452             bytes_read+=m_is.gcount();
453         }
454         catch(ios_base::failure& e) {
455             Category::getInstance(XMLTOOLING_LOGCAT ".StreamInputSource").critStream()
456                 << "XML::StreamInputSource::StreamBinInputStream::readBytes caught an exception: " << e.what()
457                 << logging::eol;
458             *toFill=0;
459             return 0;
460         }
461     }
462     return bytes_read;
463 }
464 
465 #ifdef XMLTOOLING_LITE
466 
URLInputSource(const XMLCh * url,const char * systemId,string * cacheTag,std::string backingFile)467 URLInputSource::URLInputSource(const XMLCh* url, const char* systemId, string* cacheTag, std::string backingFile) : InputSource(systemId), m_backingFile(backingFile), m_url(url)
468 {
469 }
470 
URLInputSource(const DOMElement * e,const char * systemId,string * cacheTag,std::string backingFile)471 URLInputSource::URLInputSource(const DOMElement* e, const char* systemId, string* cacheTag, std::string backingFile) : InputSource(systemId), m_backingFile(backingFile)
472 {
473     static const XMLCh uri[] = UNICODE_LITERAL_3(u,r,i);
474     static const XMLCh url[] = UNICODE_LITERAL_3(u,r,l);
475 
476     const XMLCh* attr = e->getAttributeNS(nullptr, url);
477     if (!attr || !*attr) {
478         attr = e->getAttributeNS(nullptr, uri);
479         if (!attr || !*attr)
480             throw IOException("No URL supplied via DOM to URLInputSource constructor.");
481     }
482 
483     m_url.setURL(attr);
484 }
485 
makeStream() const486 BinInputStream* URLInputSource::makeStream() const
487 {
488     // Ask the URL to create us an appropriate input stream
489     return ("" == m_backingFile) ?  m_url.makeNewStream() : new CloneInputStream(m_url.makeNewStream(), m_backingFile);
490 }
491 
492 #else
493 
URLInputSource(const XMLCh * url,const char * systemId,string * cacheTag,std::string backingFile)494 URLInputSource::URLInputSource(const XMLCh* url, const char* systemId, string* cacheTag, std::string backingFile)
495     : InputSource(systemId), m_backingFile(backingFile), m_cacheTag(cacheTag), m_url(url), m_root(nullptr)
496 {
497 }
498 
URLInputSource(const DOMElement * e,const char * systemId,string * cacheTag,std::string backingFile)499 URLInputSource::URLInputSource(const DOMElement* e, const char* systemId, string* cacheTag, std::string backingFile)
500     : InputSource(systemId), m_backingFile(backingFile), m_cacheTag(cacheTag), m_root(e)
501 {
502 }
503 
makeStream() const504 BinInputStream* URLInputSource::makeStream() const
505 {
506     BinInputStream*  stream = m_root ? new CurlURLInputStream(m_root, m_cacheTag) : new CurlURLInputStream(m_url.get(), m_cacheTag);
507     return (m_backingFile.empty()) ? stream : new CloneInputStream(stream, m_backingFile);
508 }
509 
510 #endif
511 
512 const char URLInputSource::asciiStatusCodeElementName[] = "URLInputSourceStatus";
513 
514 const XMLCh URLInputSource::utf16StatusCodeElementName[] = UNICODE_LITERAL_20(U,R,L,I,n,p,u,t,S,o,u,r,c,e,S,t,a,t,u,s);
515