1 /**********************************************************************
2 Copyright (C) 2005-2006 by Chris Morley
3 Some portions Copyright (C) 2006 by Geoffrey R. Hutchison
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13 ***********************************************************************/
14 #include <openbabel/babelconfig.h>
15 #include <openbabel/xml.h>
16 
17 using namespace std;
18 namespace OpenBabel
19 {
20 
  //static variable
  //Format registered as the default; null until RegisterXMLFormat() assigns it.
  XMLBaseFormat* XMLConversion::_pDefault=nullptr;
23 
  /// Builds the extended (XML-aware) copy of an existing OBConversion.
  /// The base part is copy-constructed from *pConv. No libxml2 reader or
  /// writer is created here; see SetupReader() and SetupWriter().
  /// @param pConv the conversion object being extended; kept in _pConv
  XMLConversion::XMLConversion(OBConversion* pConv)
    : OBConversion(*pConv),
      _requestedpos(0), _lastpos(0),
      _reader(nullptr), _writer(nullptr),
      _LookingForNamespace(false), _SkipNextRead(false)
  {
    _pConv = pConv;
    pConv->SetAuxConv(this);//marks original OBConversion object as having been extended
    SetAuxConv(this);//marks this new object as extended (for use with OBConversion pointer)
  }
34 
SetupReader()35   bool XMLConversion::SetupReader()
36   {
37     if(_reader)
38       return true; //do not need to make a new reader
39 
40 	//setup libxml2 for use in a potentially multithreaded
41 	//environment
42 	xmlInitParser();
43 
44     //If the inputstream is not at the start (probably arising in fastsearch),
45     //save its position and rewind so that the reader initialization is ok.
46     //(Getting the requested object is handled in ReadXML(), when the format is known.)
47     _requestedpos = GetInStream()->tellg();
48     if(_requestedpos < 0)
49       _requestedpos = 0;
50     if(_requestedpos)
51       GetInStream()->seekg(0);
52 
53     //Set up a parser from an input stream
54     _reader = xmlReaderForIO(
55                              ReadStream, //xmlInputReadCallback (static member function)
56                              nullptr,    //xmlInputCloseCallback (static member function)
57                              this,       //context
58                              "",         //URL
59                              nullptr,    //encoding
60                              0);         //options
61 
62     if (_reader == nullptr)
63       {
64         cerr << "Cannot set up libxml2 reader" << endl;
65         return false;
66       }
67     //A new reader immediately reads 4 bytes (presumably to determine
68     //the encoding).
69     _lastpos = GetInStream()->tellg();
70     return true;
71   }
72 
SetupWriter()73   bool XMLConversion::SetupWriter()
74   {
75     //Set up XML writer if one does not already exist
76     if(_writer)
77       return true;
78 
79     _buf = xmlOutputBufferCreateIO  (
80                                      WriteStream, //xmlOutputWriteCallback
81                                      nullptr,     //xmlOutputCloseCallback
82                                      this,        //context
83                                      nullptr);    //xmlCharEncodingHandlerPtr
84     _writer = xmlNewTextWriter(_buf);
85 
86     if(!_buf || !_writer)
87       {
88         cerr << "Error setting up xml writer\n" << endl;
89         return false;
90       }
91 
92     int ret;
93     if(IsOption("c"))
94       ret = xmlTextWriterSetIndent(_writer,0);
95     else
96       {
97         ret = xmlTextWriterSetIndent(_writer,1);
98         ret = xmlTextWriterSetIndentString(_writer, BAD_CAST " ");
99       }
100     return ret==0;
101   }
102 
~XMLConversion()103   XMLConversion::~XMLConversion()
104   {
105     if(_reader) {
106       xmlFreeTextReader(_reader);
107       _reader = nullptr;
108     }
109     if(_writer) {
110 //      xmlTextWriterEndDocument(_writer); //if hasn't been called ealier
111         xmlFreeTextWriter(_writer);// was crashing
112         _writer = nullptr;
113     }
114     //xmlBufferFree(_buf);
115   }
116 
117   ///Called from each XML class during its construction
RegisterXMLFormat(XMLBaseFormat * pFormat,bool IsDefault,const char * uri)118   void XMLConversion::RegisterXMLFormat(XMLBaseFormat* pFormat, bool IsDefault, const char* uri)
119   {
120     if(IsDefault || Namespaces().empty())
121       _pDefault=pFormat;
122     if(uri)
123       Namespaces()[uri] = pFormat;
124     else
125       Namespaces()[pFormat->NamespaceURI()] = pFormat;
126   }
127 
128   ///Returns the extended form of the OBConversion object with an xml reader or writer,
129   /// if this has not already been done.
GetDerived(OBConversion * pConv,bool ForReading)130   XMLConversion* XMLConversion::GetDerived(OBConversion* pConv, bool ForReading)
131   {
132     XMLConversion* pxmlConv;
133     if(!pConv->GetAuxConv())
134       //Need to make an extended copy. It will be deleted by pConv's destructor
135       pxmlConv =  new XMLConversion(pConv);
136     else
137       {
138         //pConv has already had an extended copy made
139         *pConv->GetAuxConv() = *pConv; //ensure they have the same OBConversion data
140         pxmlConv = dynamic_cast<XMLConversion*>(pConv->GetAuxConv());
141         if (!pxmlConv)
142           return nullptr;
143       }
144 
145     if(ForReading)
146       {
147         streampos pos = pConv->GetInStream()->tellg();
148 
149         if(pos < pxmlConv->_lastpos || pxmlConv->_lastpos<0)
150           {
151             //Probably a new file; copy some member vars and renew the current reader
152             xmlFreeTextReader(pxmlConv->_reader); //need a new reader to read files with <?xml?>
153             pxmlConv->_reader = nullptr;
154             pxmlConv->InFilename = pConv->GetInFilename();
155             pxmlConv->pInFormat = pConv->GetInFormat();
156 
157           }
158         pxmlConv->SetupReader();
159       }
160     else
161     {
162       pxmlConv->SetupWriter();
163       pxmlConv->SetLast(pConv->IsLast()); //Copy IsLast flag to the extended object
164     }
165     return pxmlConv;
166   }
167 
168 
  /// Drives the libxml2 reader, passing element start and end nodes to
  /// pFormat->DoElement() / pFormat->EndElement() until one of them returns
  /// false (and input option "n" is not set), the stream ends or parsing fails.
  /// @param pFormat format class receiving the element callbacks
  /// @param pOb     object handed to ReadMolecule() if the format is switched
  ///                after a registered namespace is recognized
  /// @return true when a callback stopped the parse; the result of
  ///         ReadMolecule() when the format is switched; false on a parser
  ///         error or when no element was read; otherwise whether the stream
  ///         is still good and the reader did not reach the end of the document
  bool XMLConversion::ReadXML(XMLBaseFormat* pFormat, OBBase* pOb)
  {
    if(_requestedpos)
      {
        //The initial stream position was not at the start, probably because of fastsearch
        //Read and discard the first object to synchronize the reader,
        //then continue getting the requested object.
        //Assumes the objects are all at the same level in the DOM tree.
        SetOneObjectOnly(); //probably already set
        streampos SavedReqestedPos = _requestedpos;
        _requestedpos=0;//don't do this again
        ReadXML(pFormat,pOb);
        GetInStream()->seekg(SavedReqestedPos);
      }

    //**Parse
    int result=1;
    unsigned elementCnt = 0; //number of start elements seen during this call
    //When _SkipNextRead is set the current node is processed again without advancing the reader
    while(!GetInStream()->bad() && !GetInStream()->eof() && (_SkipNextRead || (result=xmlTextReaderRead(_reader))==1)) //read may not be called
    {
      _SkipNextRead=false;
      if(_LookingForNamespace)
      {
        const xmlChar* puri = xmlTextReaderConstNamespaceUri(_reader);
        if(puri)
          {
            string uri((const char*)puri);
            //Look up appropriate format class from the namespace URI
            NsMapType::iterator nsiter;
            nsiter = Namespaces().find(uri);
            if(nsiter!=Namespaces().end())
              {
                XMLBaseFormat* pNewFormat = nsiter->second;
                //Must have same target, e.g. OBMol, as current format
                if(pNewFormat->GetType() == pFormat->GetType())
                  {
                    //Switch to the format registered for this namespace and let it
                    //read the object, starting again from the current node
                    _LookingForNamespace=false;
                    _SkipNextRead=true;
                    SetInFormat(pNewFormat);
                    return pNewFormat->ReadMolecule(pOb,this);
                  }
              }
          }
      }

      const xmlChar* pname = xmlTextReaderConstLocalName(_reader);
      int typ = xmlTextReaderNodeType(_reader);
      if(typ==XML_READER_TYPE_SIGNIFICANT_WHITESPACE || !pname)
        continue; //Text nodes handled in format class
      string ElName((const char*)pname);

      //Pass the node on to the appropriate format class
      bool ret;
      if(typ==XML_READER_TYPE_ELEMENT)
      {
        elementCnt++;
        ret= pFormat->DoElement(ElName);
      }
      else if(typ==XML_READER_TYPE_END_ELEMENT)
        ret= pFormat->EndElement(ElName);
      else
        continue;
      //Remember how far the stream has been consumed after each handled node
      _lastpos = GetInStream()->tellg();

      if(!ret)
        //derived format callback has stopped processing by returning false;
        //leave reader intact so it can be continued to be used.
        //(with input option "n" set the loop just carries on instead)
        if(!IsOption("n",OBConversion::INOPTIONS))
          {
            _LookingForNamespace = true;
            return true;
          }
    }

    if(result==-1)
    {
      //The reader reported a parse error: log it and mark the stream as finished
      xmlError* perr = xmlGetLastError();
      if(perr && perr->level!=XML_ERR_NONE)
        {
          obErrorLog.ThrowError("XML Parser " + GetInFilename(),
                                perr->message, obError);
        }
      xmlResetError(perr);
      GetInStream()->setstate(ios::eofbit);
      return false;
    }
    else if(elementCnt == 0)
    {
      //didn't actually read any data (e.g., </cml> end tag)
      return false;
    }
    return GetInStream()->good() && result!=0;
  }
262 
263   /////////////////////////////////////////////////////////
264   ///Read and discard XML text up to the next occurrence of the tag e.g."/molecule>"
265   ///This is left as the current node. Returns 1 on success, 0 if not found, -1 if failed.
SkipXML(const char * ctag)266   int XMLConversion::SkipXML(const char* ctag)
267   {
268     string tag(ctag);
269     tag.erase(--tag.end()); //remove >
270     int targettyp = XML_READER_TYPE_ELEMENT;
271     if(tag[0]=='/')
272       {
273         tag.erase(0,1);
274         targettyp = XML_READER_TYPE_END_ELEMENT;
275       }
276 
277     int result;
278     while((result = xmlTextReaderRead(_reader))==1)
279       {
280         if(xmlTextReaderNodeType(_reader)==targettyp
281            && !xmlStrcmp(xmlTextReaderConstLocalName(_reader), BAD_CAST	tag.c_str()))
282           break;
283       }
284     return result;
285   }
286   /////////////////////////////////////////////////////////
GetAttribute(const char * attrname)287   string XMLConversion::GetAttribute(const char* attrname)
288   {
289     string AttributeValue;
290     xmlChar* pvalue  = xmlTextReaderGetAttribute(_reader, BAD_CAST attrname);
291     if(pvalue)
292     {
293       AttributeValue = (const char*)pvalue;
294       xmlFree(pvalue);
295     }
296     return AttributeValue;
297   }
298 
299   ////////////////////////////////////////////////////////
GetContent()300   string XMLConversion::GetContent()
301   {
302     xmlTextReaderRead(_reader);
303     const xmlChar* pvalue = xmlTextReaderConstValue(_reader);
304     string value((const char*)pvalue);
305     return Trim(value);
306   }
307 
308   ////////////////////////////////////////////////////////
GetContentInt(int & value)309   bool XMLConversion::GetContentInt(int& value)
310   {
311     xmlTextReaderRead(_reader);
312     const xmlChar* pvalue = xmlTextReaderConstValue(_reader);
313     if(!pvalue)
314       return false;
315     value = atoi((const char*)pvalue);
316     return true;
317   }
318 
319   ////////////////////////////////////////////////////////
GetContentDouble(double & value)320   bool XMLConversion::GetContentDouble(double& value)
321   {
322     xmlTextReaderRead(_reader);
323     const xmlChar* pvalue = xmlTextReaderConstValue(_reader);
324     if(!pvalue)
325       return false;
326     value = strtod((const char*)pvalue, nullptr);
327     return true;
328   }
329 
330   ////////////////////////////////////////////////////////
331   ///Static callback function for xmlReaderForIO(). Reads up to the next '>', or len chars.
332 
ReadStream(void * context,char * buffer,int len)333   int XMLConversion::ReadStream(void * context, char * buffer, int len)
334   {
335     //@todo worry about non-ascii coding
336     XMLConversion* pConv = static_cast<XMLConversion*>(context);
337     istream* ifs = pConv->GetInStream();
338 
339     if(!ifs->good() || ifs->eof())
340       return 0;
341 
342     ifs->get(buffer, len+1, '>');
343     streamsize count = strlen(buffer);
344 
345     if(ifs->peek()=='>')
346       {
347         ifs->ignore();
348         buffer[count] = '>';
349         buffer[++count] = '\0';
350       }
351 
352 		if (ifs->peek() == '\n' || ifs->peek() == '\r')
353 			{
354 				ifs->get(); // remove any trailing endlines
355 			}
356     return count;
357   }
358 
359   //////////////////////////////////////////////////////////
WriteStream(void * context,const char * buffer,int len)360   int XMLConversion::WriteStream(void * context, const char * buffer, int len)
361   {
362     XMLConversion* pxmlConv = static_cast<XMLConversion*>(context);
363     ostream* ofs = pxmlConv->GetOutStream();
364     if(len>0)                //a call with len=0 coming from xmlFreeTextWriter
365     {                        //called from destructor of XMLConversion was causing crash
366       ofs->write(buffer,len);
367       if(!ofs)
368         return -1;
369       ofs->flush();
370     }
371     return len;
372   }
373 
374 } //namespace OpenBabel
375 // http://xmlsoft.org/html/libxml-xmlreader.html
376 
377 /*
378 Programming notes on XML formats
379 
380 So that there would be no limitation of file sizes, the libxml2
381 reader was chosen. Rather than build a whole xml tree internally
382 as a DOM parser does, this provides callbacks when each element,
383 etc. is encountered (like SAX). Nevertheless it is aware of the
XML structure and will fail if it encounters irregular input. It
385 is therefore necessary to use a single instance of the reader for
386 each conversion process, rather than one for each object as would
387 have been more natural in OB (see below). This input process can
388 span multiple input files and is associated with the OBConversion
389 object - in particular the reader object is destroyed at the same
390 time as the OBConversion object. But it is not as simple as using
391 an extended OBConversion derived from the base class, because the
392 base OBConversion object has been constructed before the XML format
393 has been called. It might have been possible to have the reader as a
394 member variable in OBConversion, but that would make an undesirable
395 dependency for obconversion.cpp on the XML formats.
396 
397 The way it has been done maintains generality and no dependency.
398 OBConversion is given a member variable pAuxConv which is a pointer
399 to an OBConversion object. This is deleted in the OBConversion
400 destructor. By default pAuxConv is NULL.
401 XMLConversion is a class derived from OBConversion and
402 contains the interfacing with libxml2 for both reading and writing.
403 When a conversion involves an XML format, an instance of it is made
404 and pAuxConv in the original OBConversion is set to point to it.
405 This process is potentially extendable to allow other, as yet
406 unwritten, OBConversion extensions by having a chain of pointers
407 to derived OBConversion objects through their pAuxConv members,
408 with the last one being NULL.
409 
410 The design has to make sure that multi-object files are handled
411 in a way consistent with the rest of OpenBabel. This is based on
412 formats such as SMILES and MDL mol where the objects are just
413 concatenated. OpenBabel converts by reading one object at a time
414 from the input stream. The position in the input stream (obtained
415 from tellg) is also used to skip objects and as the index in fast
416 searching. These depend on input file position being left between
417 objects ready to read the next one.
418 
419 This causes some difficulty when using libxml2 as the XML parser
420 because it is a C application and does not have C++ input streams.
421 xmlReaderForIO is used which requests input data from the callback
422 routine XMLConversion::ReadStream(). This inputs chunks of characters
423 up to and including '>'. This ensures that the input stream is between
424 objects after an object has been parsed, ready for the next one.
425 
426 Parsing XML
427 At the start and end of each element the DoElement() and EndElement()
428 routines respectively in the format are called. The name of the element
429 is passed as a parameter and up to now it has been considered sufficient
430 to find the appropriate code using a set of if else statements. Only
431 those of interest need be handled. The attributes and content of the
432 element are found by calling libxml2 routines from within the format class.
433 Parsing is stopped and an object returned to OBConversion when false is
434 returned from DoElement or (more usually) from EndElement.
435 
436 Namespaces
437 XMLConversion class keeps a static map of xml namespaces and the classes
438 derived from XMLBaseFormat which implement them. It is populated on startup
439 by the format classes calling RegisterXMLFormat from their default constructors.
440 
When ReadChemObject() of a format class is called, that format becomes the
current format and is used for all namespaces. So if CMLFormat is called it will find
all the molecules in a CMLReact file.
444 
When ReadChemObject() of the base class XMLFormat is the one called (e.g. for files
with extension .xml), the initial current format is the default format (see below),
provided it handles the same chemical object type as the output format. The
448 ReadChemObject() of the default format is called and processing is as if the
449 default format was called, except that the first explicit namespace declaration
450 in the xml file that appears in the map can switch the handling to its associated
451 format.
452 
453 The default format is either the first class to register or one which identifies
454 itself as the default when calling RegisterXMLFormat().
455 
456 */
457 
458