1 /********************************************************************** 2 Copyright (C) 2005-2006 by Chris Morley 3 Some portions Copyright (C) 2006 by Geoffrey R. Hutchison 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation version 2 of the License. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 ***********************************************************************/ 14 #include <openbabel/babelconfig.h> 15 #include <openbabel/xml.h> 16 17 using namespace std; 18 namespace OpenBabel 19 { 20 21 //static variable 22 XMLBaseFormat* XMLConversion::_pDefault=nullptr; 23 XMLConversion(OBConversion * pConv)24 XMLConversion::XMLConversion(OBConversion* pConv) 25 : OBConversion(*pConv), 26 _requestedpos(0), _lastpos(0), 27 _reader(nullptr), _writer(nullptr), 28 _LookingForNamespace(false), _SkipNextRead(false) 29 { 30 _pConv = pConv; 31 pConv->SetAuxConv(this);//marks original OBConversion object as having been extended 32 SetAuxConv(this);//marks this new object as extended (for use with OBConversion pointer) 33 } 34 SetupReader()35 bool XMLConversion::SetupReader() 36 { 37 if(_reader) 38 return true; //do not need to make a new reader 39 40 //setup libxml2 for use in a potentially multithreaded 41 //environment 42 xmlInitParser(); 43 44 //If the inputstream is not at the start (probably arising in fastsearch), 45 //save its position and rewind so that the reader initialization is ok. 46 //(Getting the requested object is handled in ReadXML(), when the format is known.) 47 _requestedpos = GetInStream()->tellg(); 48 if(_requestedpos < 0) 49 _requestedpos = 0; 50 if(_requestedpos) 51 GetInStream()->seekg(0); 52 53 //Set up a parser from an input stream 54 _reader = xmlReaderForIO( 55 ReadStream, //xmlInputReadCallback (static member function) 56 nullptr, //xmlInputCloseCallback (static member function) 57 this, //context 58 "", //URL 59 nullptr, //encoding 60 0); //options 61 62 if (_reader == nullptr) 63 { 64 cerr << "Cannot set up libxml2 reader" << endl; 65 return false; 66 } 67 //A new reader immediately reads 4 bytes (presumably to determine 68 //the encoding). 69 _lastpos = GetInStream()->tellg(); 70 return true; 71 } 72 SetupWriter()73 bool XMLConversion::SetupWriter() 74 { 75 //Set up XML writer if one does not already exist 76 if(_writer) 77 return true; 78 79 _buf = xmlOutputBufferCreateIO ( 80 WriteStream, //xmlOutputWriteCallback 81 nullptr, //xmlOutputCloseCallback 82 this, //context 83 nullptr); //xmlCharEncodingHandlerPtr 84 _writer = xmlNewTextWriter(_buf); 85 86 if(!_buf || !_writer) 87 { 88 cerr << "Error setting up xml writer\n" << endl; 89 return false; 90 } 91 92 int ret; 93 if(IsOption("c")) 94 ret = xmlTextWriterSetIndent(_writer,0); 95 else 96 { 97 ret = xmlTextWriterSetIndent(_writer,1); 98 ret = xmlTextWriterSetIndentString(_writer, BAD_CAST " "); 99 } 100 return ret==0; 101 } 102 ~XMLConversion()103 XMLConversion::~XMLConversion() 104 { 105 if(_reader) { 106 xmlFreeTextReader(_reader); 107 _reader = nullptr; 108 } 109 if(_writer) { 110 // xmlTextWriterEndDocument(_writer); //if hasn't been called ealier 111 xmlFreeTextWriter(_writer);// was crashing 112 _writer = nullptr; 113 } 114 //xmlBufferFree(_buf); 115 } 116 117 ///Called from each XML class during its construction RegisterXMLFormat(XMLBaseFormat * pFormat,bool IsDefault,const char * uri)118 void XMLConversion::RegisterXMLFormat(XMLBaseFormat* pFormat, bool IsDefault, const char* uri) 119 { 120 if(IsDefault || Namespaces().empty()) 121 _pDefault=pFormat; 122 if(uri) 123 Namespaces()[uri] = pFormat; 124 else 125 Namespaces()[pFormat->NamespaceURI()] = pFormat; 126 } 127 128 ///Returns the extended form of the OBConversion object with an xml reader or writer, 129 /// if this has not already been done. GetDerived(OBConversion * pConv,bool ForReading)130 XMLConversion* XMLConversion::GetDerived(OBConversion* pConv, bool ForReading) 131 { 132 XMLConversion* pxmlConv; 133 if(!pConv->GetAuxConv()) 134 //Need to make an extended copy. It will be deleted by pConv's destructor 135 pxmlConv = new XMLConversion(pConv); 136 else 137 { 138 //pConv has already had an extended copy made 139 *pConv->GetAuxConv() = *pConv; //ensure they have the same OBConversion data 140 pxmlConv = dynamic_cast<XMLConversion*>(pConv->GetAuxConv()); 141 if (!pxmlConv) 142 return nullptr; 143 } 144 145 if(ForReading) 146 { 147 streampos pos = pConv->GetInStream()->tellg(); 148 149 if(pos < pxmlConv->_lastpos || pxmlConv->_lastpos<0) 150 { 151 //Probably a new file; copy some member vars and renew the current reader 152 xmlFreeTextReader(pxmlConv->_reader); //need a new reader to read files with <?xml?> 153 pxmlConv->_reader = nullptr; 154 pxmlConv->InFilename = pConv->GetInFilename(); 155 pxmlConv->pInFormat = pConv->GetInFormat(); 156 157 } 158 pxmlConv->SetupReader(); 159 } 160 else 161 { 162 pxmlConv->SetupWriter(); 163 pxmlConv->SetLast(pConv->IsLast()); //Copy IsLast flag to the extended object 164 } 165 return pxmlConv; 166 } 167 168 ReadXML(XMLBaseFormat * pFormat,OBBase * pOb)169 bool XMLConversion::ReadXML(XMLBaseFormat* pFormat, OBBase* pOb) 170 { 171 if(_requestedpos) 172 { 173 //The initial stream position was not at the start, probably because of fastsearch 174 //Read and discard the first object to synchronize the reader, 175 //then continue getting the requested object. 176 //Assumes the objects are all at the same level in the DOM tree. 177 SetOneObjectOnly(); //probably already set 178 streampos SavedReqestedPos = _requestedpos; 179 _requestedpos=0;//don't do this again 180 ReadXML(pFormat,pOb); 181 GetInStream()->seekg(SavedReqestedPos); 182 } 183 184 //**Parse 185 int result=1; 186 unsigned elementCnt = 0; 187 while(!GetInStream()->bad() && !GetInStream()->eof() && (_SkipNextRead || (result=xmlTextReaderRead(_reader))==1)) //read may not be called 188 { 189 _SkipNextRead=false; 190 if(_LookingForNamespace) 191 { 192 const xmlChar* puri = xmlTextReaderConstNamespaceUri(_reader); 193 if(puri) 194 { 195 string uri((const char*)puri); 196 //Look up appropriate format class from the namespace URI 197 NsMapType::iterator nsiter; 198 nsiter = Namespaces().find(uri); 199 if(nsiter!=Namespaces().end()) 200 { 201 XMLBaseFormat* pNewFormat = nsiter->second; 202 //Must have same target, e.g. OBMol, as current format 203 if(pNewFormat->GetType() == pFormat->GetType()) 204 { 205 _LookingForNamespace=false; 206 _SkipNextRead=true; 207 SetInFormat(pNewFormat); 208 return pNewFormat->ReadMolecule(pOb,this); 209 } 210 } 211 } 212 } 213 214 const xmlChar* pname = xmlTextReaderConstLocalName(_reader); 215 int typ = xmlTextReaderNodeType(_reader); 216 if(typ==XML_READER_TYPE_SIGNIFICANT_WHITESPACE || !pname) 217 continue; //Text nodes handled in format class 218 string ElName((const char*)pname); 219 220 //Pass the node on to the appropriate format class 221 bool ret; 222 if(typ==XML_READER_TYPE_ELEMENT) 223 { 224 elementCnt++; 225 ret= pFormat->DoElement(ElName); 226 } 227 else if(typ==XML_READER_TYPE_END_ELEMENT) 228 ret= pFormat->EndElement(ElName); 229 else 230 continue; 231 _lastpos = GetInStream()->tellg(); 232 233 if(!ret) 234 //derived format callback has stopped processing by returning false; 235 //leave reader intact so it can be continued to be used. 236 if(!IsOption("n",OBConversion::INOPTIONS)) 237 { 238 _LookingForNamespace = true; 239 return true; 240 } 241 } 242 243 if(result==-1) 244 { 245 xmlError* perr = xmlGetLastError(); 246 if(perr && perr->level!=XML_ERR_NONE) 247 { 248 obErrorLog.ThrowError("XML Parser " + GetInFilename(), 249 perr->message, obError); 250 } 251 xmlResetError(perr); 252 GetInStream()->setstate(ios::eofbit); 253 return false; 254 } 255 else if(elementCnt == 0) 256 { 257 //didn't actually read any data (e.g., </cml> end tag) 258 return false; 259 } 260 return GetInStream()->good() && result!=0; 261 } 262 263 ///////////////////////////////////////////////////////// 264 ///Read and discard XML text up to the next occurrence of the tag e.g."/molecule>" 265 ///This is left as the current node. Returns 1 on success, 0 if not found, -1 if failed. SkipXML(const char * ctag)266 int XMLConversion::SkipXML(const char* ctag) 267 { 268 string tag(ctag); 269 tag.erase(--tag.end()); //remove > 270 int targettyp = XML_READER_TYPE_ELEMENT; 271 if(tag[0]=='/') 272 { 273 tag.erase(0,1); 274 targettyp = XML_READER_TYPE_END_ELEMENT; 275 } 276 277 int result; 278 while((result = xmlTextReaderRead(_reader))==1) 279 { 280 if(xmlTextReaderNodeType(_reader)==targettyp 281 && !xmlStrcmp(xmlTextReaderConstLocalName(_reader), BAD_CAST tag.c_str())) 282 break; 283 } 284 return result; 285 } 286 ///////////////////////////////////////////////////////// GetAttribute(const char * attrname)287 string XMLConversion::GetAttribute(const char* attrname) 288 { 289 string AttributeValue; 290 xmlChar* pvalue = xmlTextReaderGetAttribute(_reader, BAD_CAST attrname); 291 if(pvalue) 292 { 293 AttributeValue = (const char*)pvalue; 294 xmlFree(pvalue); 295 } 296 return AttributeValue; 297 } 298 299 //////////////////////////////////////////////////////// GetContent()300 string XMLConversion::GetContent() 301 { 302 xmlTextReaderRead(_reader); 303 const xmlChar* pvalue = xmlTextReaderConstValue(_reader); 304 string value((const char*)pvalue); 305 return Trim(value); 306 } 307 308 //////////////////////////////////////////////////////// GetContentInt(int & value)309 bool XMLConversion::GetContentInt(int& value) 310 { 311 xmlTextReaderRead(_reader); 312 const xmlChar* pvalue = xmlTextReaderConstValue(_reader); 313 if(!pvalue) 314 return false; 315 value = atoi((const char*)pvalue); 316 return true; 317 } 318 319 //////////////////////////////////////////////////////// GetContentDouble(double & value)320 bool XMLConversion::GetContentDouble(double& value) 321 { 322 xmlTextReaderRead(_reader); 323 const xmlChar* pvalue = xmlTextReaderConstValue(_reader); 324 if(!pvalue) 325 return false; 326 value = strtod((const char*)pvalue, nullptr); 327 return true; 328 } 329 330 //////////////////////////////////////////////////////// 331 ///Static callback function for xmlReaderForIO(). Reads up to the next '>', or len chars. 332 ReadStream(void * context,char * buffer,int len)333 int XMLConversion::ReadStream(void * context, char * buffer, int len) 334 { 335 //@todo worry about non-ascii coding 336 XMLConversion* pConv = static_cast<XMLConversion*>(context); 337 istream* ifs = pConv->GetInStream(); 338 339 if(!ifs->good() || ifs->eof()) 340 return 0; 341 342 ifs->get(buffer, len+1, '>'); 343 streamsize count = strlen(buffer); 344 345 if(ifs->peek()=='>') 346 { 347 ifs->ignore(); 348 buffer[count] = '>'; 349 buffer[++count] = '\0'; 350 } 351 352 if (ifs->peek() == '\n' || ifs->peek() == '\r') 353 { 354 ifs->get(); // remove any trailing endlines 355 } 356 return count; 357 } 358 359 ////////////////////////////////////////////////////////// WriteStream(void * context,const char * buffer,int len)360 int XMLConversion::WriteStream(void * context, const char * buffer, int len) 361 { 362 XMLConversion* pxmlConv = static_cast<XMLConversion*>(context); 363 ostream* ofs = pxmlConv->GetOutStream(); 364 if(len>0) //a call with len=0 coming from xmlFreeTextWriter 365 { //called from destructor of XMLConversion was causing crash 366 ofs->write(buffer,len); 367 if(!ofs) 368 return -1; 369 ofs->flush(); 370 } 371 return len; 372 } 373 374 } //namespace OpenBabel 375 // http://xmlsoft.org/html/libxml-xmlreader.html 376 377 /* 378 Programming notes on XML formats 379 380 So that there would be no limitation of file sizes, the libxml2 381 reader was chosen. Rather than build a whole xml tree internally 382 as a DOM parser does, this provides callbacks when each element, 383 etc. is encountered (like SAX). Nevertheless it is aware of the 384 XML structure and will fail if it enounters irregular input. It 385 is therefore necessary to use a single instance of the reader for 386 each conversion process, rather than one for each object as would 387 have been more natural in OB (see below). This input process can 388 span multiple input files and is associated with the OBConversion 389 object - in particular the reader object is destroyed at the same 390 time as the OBConversion object. But it is not as simple as using 391 an extended OBConversion derived from the base class, because the 392 base OBConversion object has been constructed before the XML format 393 has been called. It might have been possible to have the reader as a 394 member variable in OBConversion, but that would make an undesirable 395 dependency for obconversion.cpp on the XML formats. 396 397 The way it has been done maintains generality and no dependency. 398 OBConversion is given a member variable pAuxConv which is a pointer 399 to an OBConversion object. This is deleted in the OBConversion 400 destructor. By default pAuxConv is NULL. 401 XMLConversion is a class derived from OBConversion and 402 contains the interfacing with libxml2 for both reading and writing. 403 When a conversion involves an XML format, an instance of it is made 404 and pAuxConv in the original OBConversion is set to point to it. 405 This process is potentially extendable to allow other, as yet 406 unwritten, OBConversion extensions by having a chain of pointers 407 to derived OBConversion objects through their pAuxConv members, 408 with the last one being NULL. 409 410 The design has to make sure that multi-object files are handled 411 in a way consistent with the rest of OpenBabel. This is based on 412 formats such as SMILES and MDL mol where the objects are just 413 concatenated. OpenBabel converts by reading one object at a time 414 from the input stream. The position in the input stream (obtained 415 from tellg) is also used to skip objects and as the index in fast 416 searching. These depend on input file position being left between 417 objects ready to read the next one. 418 419 This causes some difficulty when using libxml2 as the XML parser 420 because it is a C application and does not have C++ input streams. 421 xmlReaderForIO is used which requests input data from the callback 422 routine XMLConversion::ReadStream(). This inputs chunks of characters 423 up to and including '>'. This ensures that the input stream is between 424 objects after an object has been parsed, ready for the next one. 425 426 Parsing XML 427 At the start and end of each element the DoElement() and EndElement() 428 routines respectively in the format are called. The name of the element 429 is passed as a parameter and up to now it has been considered sufficient 430 to find the appropriate code using a set of if else statements. Only 431 those of interest need be handled. The attributes and content of the 432 element are found by calling libxml2 routines from within the format class. 433 Parsing is stopped and an object returned to OBConversion when false is 434 returned from DoElement or (more usually) from EndElement. 435 436 Namespaces 437 XMLConversion class keeps a static map of xml namespaces and the classes 438 derived from XMLBaseFormat which implement them. It is populated on startup 439 by the format classes calling RegisterXMLFormat from their default constructors. 440 441 When ReadChemObject() of a format class is called, the current format is set by 442 there and is used for all namespaces. So if CMLFormat is called it will find 443 all the molecules in a CMLReact file. 444 445 When ReadChemObject() of the base class XMLFormat is the one called (e.g for files 446 with extension .xml), the initial current format is the default format(see below) 447 providing it handles the same chemical object type as the output format. The 448 ReadChemObject() of the default format is called and processing is as if the 449 default format was called, except that the first explicit namespace declaration 450 in the xml file that appears in the map can switch the handling to its associated 451 format. 452 453 The default format is either the first class to register or one which identifies 454 itself as the default when calling RegisterXMLFormat(). 455 456 */ 457 458