1 /********************************************************************** 2 mmcifformat.cpp - Conversion to and from mmCIF format. 3 Copyright (C) Scarlet Line 2007 4 5 This file is part of the Open Babel project. 6 For more information, see <http://openbabel.org/> 7 8 This program is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation version 2 of the License. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 ***********************************************************************/ 17 18 #include <openbabel/babelconfig.h> 19 #include <openbabel/obmolecformat.h> 20 #include <openbabel/mol.h> 21 #include <openbabel/atom.h> 22 #include <openbabel/elements.h> 23 #include <openbabel/generic.h> 24 25 #include <openbabel/op.h> 26 27 #include <iostream> 28 #include <algorithm> 29 #include <ctype.h> 30 31 using namespace std; 32 namespace OpenBabel 33 { 34 static const string UNKNOWN_VALUE = "?"; 35 36 class mmCIFFormat : public OBMoleculeFormat 37 { 38 public: 39 //Register this format type ID mmCIFFormat()40 mmCIFFormat() 41 { // Copied from the Chemical MIME Page at http://www.ch.ic.ac.uk/chemime/ 42 OBConversion::RegisterFormat("mcif", this, "chemical/x-mmcif"); 43 OBConversion::RegisterFormat("mmcif", this, "chemical/x-mmcif"); 44 // Uncomment the following line, and this file will handle all CIF formats 45 // OBConversion::RegisterFormat("cif", this, "chemical/x-cif"); 46 47 OBConversion::RegisterOptionParam("s", this); 48 OBConversion::RegisterOptionParam("p", this); 49 OBConversion::RegisterOptionParam("b", this); 50 OBConversion::RegisterOptionParam("w", this); 51 } 52 Description()53 virtual const char* Description() //required 54 { 55 return 56 "Macromolecular Crystallographic Info\n " 57 "Read Options e.g. -as\n" 58 " s Output single bonds only\n" 59 " p Apply periodic boundary conditions for bonds\n" 60 " b Disable bonding entirely\n" 61 " w Wrap atomic coordinates into unit cell box\n\n"; 62 }; 63 SpecificationURL()64 virtual const char* SpecificationURL() 65 { return "http://mmcif.pdb.org/";}; //optional 66 // CIF itself is at http://www.iucr.org/iucr-top/cif/index.html 67 GetMIMEType()68 virtual const char* GetMIMEType() 69 { return "chemical/x-mmcif"; }; 70 71 //*** This section identical for most OBMol conversions *** 72 //////////////////////////////////////////////////// 73 /// The "API" interface functions 74 virtual int SkipObjects(int n, OBConversion* pConv); 75 virtual bool ReadMolecule(OBBase* pOb, OBConversion* pConv); 76 virtual bool WriteMolecule(OBBase* pOb, OBConversion* pConv); 77 }; 78 79 //Make an instance of the format class 80 mmCIFFormat themmCIFFormat; 81 82 struct CIFTagID 83 { 84 enum CIFCatName 85 { 86 unread_CIFCatName, 87 atom_site, 88 cell, 89 chemical, 90 chemical_formula, 91 symmetry, 92 symmetry_equiv, 93 space_group, 94 atom_type, 95 MAX_CIFCatName 96 }; 97 enum CIFDataName 98 { 99 unread_CIFDataName, 100 _atom_site_fract_x, // The x coordinate specified as a fraction of _cell_length_a 101 _atom_site_fract_y, // The y coordinate specified as a fraction of _cell_length_b 102 _atom_site_fract_z, // The z coordinate specified as a fraction of _cell_length_c 103 _atom_site_Cartn_x, // The x coordinate in angstroms 104 _atom_site_Cartn_y, // The y coordinate in angstroms 105 _atom_site_Cartn_z, // The z coordinate in angstroms 106 _atom_site_label, // The atomic label if more detailed label info unavailable 107 _atom_site_label_atom_id, // The atomic label within the residue 108 _atom_site_label_comp_id, // The residue abbreviation, e.g. ILE 109 _atom_site_label_entity_id, // The chain entity number of the residue, e.g. 2 110 _atom_site_label_asym_id, // The unique chain id 111 _atom_site_label_seq_id, // The sequence number of the residue, within the chain, e.g. 12 112 _atom_site_type_symbol, // Atomic symbol, e.g. C 113 _atom_site_occupancy, 114 MAX_atom_site, 115 _cell_length_a, // Unit-cell length a in Angstroms 116 _cell_length_b, // Unit-cell length b in Angstroms 117 _cell_length_c, // Unit-cell length c in Angstroms 118 _cell_angle_alpha, // Unit-cell angle alpha in degrees 119 _cell_angle_beta, // Unit-cell angle beta in degrees 120 _cell_angle_gamma, // Unit-cell angle gamma in degrees 121 MAX_cell, 122 // chemical name organized by increasing desirability 123 _chemical_name_common, 124 _chemical_name_structure_type, 125 _chemical_name_mineral, 126 _chemical_name_systematic, 127 MAX_chemical, 128 // chemical formulae organized by increasing desirability 129 _chemical_formula_moiety, 130 _chemical_formula_iupac, 131 _chemical_formula_structural, 132 _chemical_formula_analytical, 133 MAX_chemical_formula, 134 _symmetry_Int_Tables_number, 135 _symmetry_space_group_name_Hall, 136 _symmetry_space_group_name_H_M, 137 MAX_symmetry, 138 _symmetry_equiv_pos_as_xyz, 139 MAX_symmetry_equiv, 140 _space_group_IT_number, 141 _space_group_name_Hall, 142 _space_group_name_H_M_alt, 143 MAX_space_group, 144 _atom_type_symbol, 145 _atom_type_oxidation_number, 146 MAX_atom_type, 147 MAX_CIFDataName 148 }; 149 char tagname[76]; 150 CIFDataName tagid; 151 }; 152 typedef vector<CIFTagID::CIFDataName> CIFColumnList; 153 typedef map<string, CIFTagID::CIFDataName> CIFtagmap; 154 struct CIFResidueID 155 { 156 unsigned long ChainNum; // The number of the chain 157 unsigned long ResNum; // The number of the residue within the chain CIFResidueIDOpenBabel::CIFResidueID158 CIFResidueID() 159 {} CIFResidueIDOpenBabel::CIFResidueID160 CIFResidueID(unsigned long c, unsigned long r) 161 :ChainNum(c), ResNum(r) 162 {} CIFResidueIDOpenBabel::CIFResidueID163 CIFResidueID(const CIFResidueID & other) 164 :ChainNum(other.ChainNum), ResNum(other.ResNum) 165 {} operator =OpenBabel::CIFResidueID166 CIFResidueID & operator=(const CIFResidueID & other) 167 { 168 ChainNum = other.ChainNum; 169 ResNum = other.ResNum; 170 return (* this); 171 } operator <OpenBabel::CIFResidueID172 bool operator< (const CIFResidueID & other) const 173 { 174 return ( ChainNum < other.ChainNum ? true : ( other.ChainNum < ChainNum ? false : ResNum < other.ResNum ) ); 175 } 176 }; 177 typedef map<CIFResidueID, int> CIFResidueMap; 178 CIFtagmap CIFtagLookupTable; 179 180 CIFTagID CIFTagsRead[] = 181 { 182 { "_atom_site_fract_x", CIFTagID::_atom_site_fract_x }, 183 { "_atom_site_fract_y", CIFTagID::_atom_site_fract_y }, 184 { "_atom_site_fract_z", CIFTagID::_atom_site_fract_z }, 185 { "_atom_site_cartn_x", CIFTagID::_atom_site_Cartn_x }, 186 { "_atom_site_cartn_y", CIFTagID::_atom_site_Cartn_y }, 187 { "_atom_site_cartn_z", CIFTagID::_atom_site_Cartn_z }, 188 { "_atom_site_type_symbol", CIFTagID::_atom_site_type_symbol }, 189 { "_atom_site_occupancy", CIFTagID::_atom_site_occupancy}, 190 { "_atom_site_id", CIFTagID::_atom_site_label }, 191 { "_atom_site_label", CIFTagID::_atom_site_label }, 192 { "_atom_site_label_atom_id", CIFTagID::_atom_site_label_atom_id }, 193 { "_atom_site_label_comp_id", CIFTagID::_atom_site_label_comp_id }, 194 { "_atom_site_label_entity_id", CIFTagID::_atom_site_label_entity_id }, 195 { "_atom_site_label_seq_id", CIFTagID::_atom_site_label_seq_id }, 196 { "_atom_site_label_asym_id", CIFTagID::_atom_site_label_asym_id }, 197 { "_cell_length_a", CIFTagID::_cell_length_a }, 198 { "_cell_length_b", CIFTagID::_cell_length_b }, 199 { "_cell_length_c", CIFTagID::_cell_length_c }, 200 { "_cell_angle_alpha", CIFTagID::_cell_angle_alpha }, 201 { "_cell_angle_beta", CIFTagID::_cell_angle_beta }, 202 { "_cell_angle_gamma", CIFTagID::_cell_angle_gamma }, 203 { "_chemical_name_systematic", CIFTagID::_chemical_name_systematic }, 204 { "_chemical_name_mineral", CIFTagID::_chemical_name_mineral }, 205 { "_chemical_name_structure_type", CIFTagID::_chemical_name_structure_type }, 206 { "_chemical_name_common", CIFTagID::_chemical_name_common }, 207 { "_chemical_formula_analytical", CIFTagID::_chemical_formula_analytical }, 208 { "_chemical_formula_structural", CIFTagID::_chemical_formula_structural }, 209 { "_chemical_formula_iupac", CIFTagID::_chemical_formula_iupac }, 210 { "_chemical_formula_moiety", CIFTagID::_chemical_formula_moiety }, 211 { "_space_group_it_number", CIFTagID::_space_group_IT_number }, 212 { "_space_group_name_hall", CIFTagID::_space_group_name_Hall }, 213 { "_space_group_name_h-m_alt", CIFTagID::_space_group_name_H_M_alt }, 214 { "_symmetry_int_tables_number", CIFTagID::_symmetry_Int_Tables_number }, 215 { "_symmetry_space_group_name_hall", CIFTagID::_symmetry_space_group_name_Hall }, 216 { "_symmetry_space_group_name_h-m", CIFTagID::_symmetry_space_group_name_H_M }, 217 { "_symmetry_equiv_pos_as_xyz", CIFTagID::_symmetry_equiv_pos_as_xyz }, 218 { "_space_group_symop_operation_xyz", CIFTagID::_symmetry_equiv_pos_as_xyz }, 219 { "_atom_type_symbol", CIFTagID::_atom_type_symbol }, 220 { "_atom_type_oxidation_number",CIFTagID::_atom_type_oxidation_number }, 221 { "", CIFTagID::unread_CIFDataName } 222 }; 223 224 class CIFLexer 225 { 226 public: 227 enum TokenType 228 { 229 UnknownToken, 230 KeyDataToken, 231 KeyLoopToken, 232 KeySaveToken, 233 KeySaveEndToken, 234 KeyStopToken, 235 KeyGlobalToken, 236 TagToken, 237 ValueToken, 238 ValueOrKeyToken, 239 MAXTokenType 240 }; 241 struct Token 242 { 243 TokenType type; 244 string as_text; as_numberOpenBabel::CIFLexer::Token245 double as_number() const 246 { return strtod(as_text.c_str(), nullptr); } as_unsignedOpenBabel::CIFLexer::Token247 unsigned long as_unsigned() const 248 { return strtoul(as_text.c_str(), nullptr, 10); } 249 }; CIFLexer(std::istream * in)250 CIFLexer(std::istream * in) 251 :input(in) 252 { 253 last_char = 0; 254 next_char = input->get(); 255 } 256 bool next_token(CIFLexer::Token & token); 257 static CIFTagID::CIFDataName lookup_tag(const string & tag_name); 258 static CIFTagID::CIFCatName lookup_cat(CIFTagID::CIFDataName tagid); advance()259 void advance() 260 { 261 last_char = next_char; 262 next_char = input->get(); 263 } backup(size_t count)264 void backup(size_t count) 265 { 266 for ( ++ count; count; -- count ) 267 input->unget(); 268 last_char = 0; 269 next_char = input->get(); 270 } backup(size_t count,char next)271 void backup(size_t count, char next) 272 { 273 for ( ; count; -- count ) 274 input->unget(); 275 last_char = 0; 276 next_char = next; 277 } good() const278 bool good() const 279 { return input->good(); } 280 private: 281 istream * input; 282 int last_char, next_char; 283 }; lookup_tag(const string & tag_name)284 CIFTagID::CIFDataName CIFLexer::lookup_tag(const string & tag_name) 285 { 286 if (CIFtagLookupTable.empty()) 287 { 288 for (size_t idx = 0; CIFTagsRead[idx].tagid != CIFTagID::unread_CIFDataName; ++ idx) 289 { 290 CIFtagLookupTable.insert(CIFtagmap::value_type(string(CIFTagsRead[idx].tagname), CIFTagsRead[idx].tagid )); 291 } 292 } 293 CIFTagID::CIFDataName rtn = CIFTagID::unread_CIFDataName; 294 CIFtagmap::const_iterator found = CIFtagLookupTable.find(tag_name); 295 if (found != CIFtagLookupTable.end()) 296 rtn = (* found).second; 297 return rtn; 298 } lookup_cat(CIFTagID::CIFDataName tagid)299 CIFTagID::CIFCatName CIFLexer::lookup_cat(CIFTagID::CIFDataName tagid) 300 { 301 CIFTagID::CIFCatName catid = CIFTagID::unread_CIFCatName; 302 if (tagid > CIFTagID::unread_CIFDataName) 303 { 304 if (tagid < CIFTagID::MAX_atom_site) 305 catid = CIFTagID::atom_site; 306 else if (tagid < CIFTagID::MAX_cell) 307 catid = CIFTagID::cell; 308 else if (tagid < CIFTagID::MAX_chemical) 309 catid = CIFTagID::chemical; 310 else if (tagid < CIFTagID::MAX_chemical_formula) 311 catid = CIFTagID::chemical_formula; 312 else if (tagid < CIFTagID::MAX_symmetry) 313 catid = CIFTagID::symmetry; 314 else if (tagid < CIFTagID::MAX_symmetry_equiv) 315 catid = CIFTagID::symmetry_equiv; 316 else if (tagid < CIFTagID::MAX_space_group) 317 catid = CIFTagID::space_group; 318 else if (tagid < CIFTagID::MAX_atom_type) 319 catid = CIFTagID::atom_type; 320 } 321 return catid; 322 } 323 next_token(CIFLexer::Token & token)324 bool CIFLexer::next_token(CIFLexer::Token & token) 325 { 326 token.type = CIFLexer::UnknownToken; 327 token.as_text.clear(); 328 while (token.type == CIFLexer::UnknownToken && input->good()) 329 { 330 if (next_char <= ' ') 331 { // whitespace 332 advance(); 333 } 334 else 335 { // i.e. not WhiteSpace 336 switch(next_char) 337 { 338 // Comment handling 339 case '#': 340 do // eat comment to the end of the line 341 { 342 advance(); 343 } while (next_char != '\n' && input->good()); 344 // We are now pointing at EOL or EOF 345 break; 346 // Tag handling 347 case '_': 348 do // read name to the next whitespace 349 { 350 if (next_char == '.') // combines DDL1 and DDL2 tag names 351 next_char = '_'; 352 else 353 next_char = tolower(next_char); 354 token.as_text.push_back((char)next_char); 355 advance(); 356 } while (next_char > ' ' && input->good()); 357 // We are now pointing at the next whitespace 358 token.type = CIFLexer::TagToken; 359 break; 360 // Quoted data handling 361 case '"': 362 do // read name to the next quote-whitespace 363 { 364 advance(); 365 if (next_char == '"') 366 { 367 while (next_char == '"') 368 { 369 advance(); 370 if (next_char <= ' ') // whitespace 371 break; 372 token.as_text.push_back((char)last_char); 373 } 374 if (next_char <= ' ') // whitespace 375 break; 376 } 377 token.as_text.push_back((char)next_char); 378 } while (input->good()); 379 // We are now pointing at the next whitespace 380 token.type = CIFLexer::ValueToken; 381 break; 382 case '\'': 383 do // read name to the next quote-whitespace 384 { 385 advance(); 386 if (next_char == '\'') 387 { 388 while (next_char == '\'') 389 { 390 advance(); 391 if (next_char <= ' ') // whitespace 392 break; 393 token.as_text.push_back((char)last_char); 394 } 395 if (next_char <= ' ') // whitespace 396 break; 397 } 398 token.as_text.push_back((char)next_char); 399 } while (input->good()); 400 // We are now pointing at the next whitespace 401 token.type = CIFLexer::ValueToken; 402 break; 403 case ';': 404 if (last_char == '\n') 405 { 406 do // read name to the next <eol>-; 407 { 408 advance(); 409 if (next_char == '\n') 410 { 411 while (next_char == '\n') 412 { 413 advance(); 414 if (next_char == ';') // end 415 break; 416 token.as_text.push_back((char)last_char); 417 } 418 if (next_char == ';') // end 419 { 420 advance(); // go past the end 421 break; 422 } 423 } 424 token.as_text.push_back((char)next_char); 425 } while (input->good()); 426 // We are now pointing at the next whitespace 427 token.type = CIFLexer::ValueToken; 428 break; 429 } 430 // drop through to the default case 431 default: // reading an un-quoted text string 432 do // read text to the next whitespace 433 { 434 token.as_text.push_back((char)next_char); 435 advance(); 436 } while (next_char > ' ' && input->good()); 437 token.type = CIFLexer::ValueOrKeyToken; 438 // We are now pointing at the next whitespace 439 break; 440 } 441 } 442 } 443 if (token.type == CIFLexer::ValueOrKeyToken) 444 { 445 string::size_type len = token.as_text.size(); 446 if (len == 1 && token.as_text[0] == '.') 447 token.type = CIFLexer::ValueToken; 448 else if (!strncasecmp(token.as_text.c_str(), "data_", 5)) 449 { 450 token.type = CIFLexer::KeyDataToken; 451 token.as_text.erase(0, 5); 452 } 453 else if (!strcasecmp(token.as_text.c_str(), "loop_")) 454 token.type = CIFLexer::KeyLoopToken; 455 else if (!strncasecmp(token.as_text.c_str(), "save_", 5)) 456 { 457 if (len == 5) 458 { 459 token.type = CIFLexer::KeySaveEndToken; 460 } 461 else 462 { 463 token.type = CIFLexer::KeySaveToken; 464 token.as_text.erase(0, 5); 465 } 466 } 467 else if (!strcasecmp(token.as_text.c_str(), "stop_")) 468 token.type = CIFLexer::KeyStopToken; 469 else if (!strcasecmp(token.as_text.c_str(), "global_")) 470 token.type = CIFLexer::KeyGlobalToken; 471 else 472 token.type = CIFLexer::ValueToken; 473 } 474 return token.type != CIFLexer::UnknownToken; 475 } 476 ///////////////////////////////////////////////////////////////// SkipObjects(int n,OBConversion * pConv)477 int mmCIFFormat::SkipObjects(int n, OBConversion* pConv) 478 { 479 if (n == 0) 480 ++ n; 481 CIFLexer lexer(pConv->GetInStream()); 482 CIFLexer::Token token; 483 while (n && lexer.good()) 484 { 485 while ( lexer.next_token(token) && token.type != CIFLexer::KeyDataToken); 486 -- n; 487 } 488 if (lexer.good()) 489 lexer.backup(5 + token.as_text.size(), 'd'); // length of "data_<name>" 490 491 return lexer.good() ? 1 : -1; 492 } ReadMolecule(OBBase * pOb,OBConversion * pConv)493 bool mmCIFFormat::ReadMolecule(OBBase* pOb, OBConversion* pConv) 494 { 495 OBMol* pmol = pOb->CastAndClear<OBMol>(); 496 if (pmol == nullptr) 497 return false; 498 499 CIFLexer lexer(pConv->GetInStream()); 500 CIFLexer::Token token; 501 502 typedef map<string, unsigned> CIFasymmap; 503 CIFasymmap asym_map; 504 string last_asym_id = ""; 505 unsigned next_asym_no = 0; 506 bool has_residue_information = false; 507 508 pmol->SetChainsPerceived(); // avoid perception if we are setting residues 509 510 bool wrap_coords = pConv->IsOption("w",OBConversion::INOPTIONS); 511 512 // move to the next data block (i.e. molecule, we hope ) 513 while (lexer.next_token(token) && token.type != CIFLexer::KeyDataToken); 514 if (token.type == CIFLexer::KeyDataToken) 515 { // we have found the next data block: 516 pmol->BeginModify(); 517 pmol->SetTitle(token.as_text); 518 bool finished = false, token_peeked = false; 519 double cell_a = 1.0, cell_b = 1.0, cell_c = 1.0; 520 double cell_alpha = 90.0, cell_beta = 90.0, cell_gamma = 90.0; 521 CIFTagID::CIFDataName name_tag = CIFTagID::unread_CIFDataName; 522 CIFTagID::CIFDataName formula_tag = CIFTagID::unread_CIFDataName; 523 int use_cell = 0, use_fract = 0; 524 string space_group_name("P1"); 525 SpaceGroup space_group; 526 bool space_group_failed = false; 527 std::map<string, double> atomic_charges; 528 while (!finished && (token_peeked || lexer.next_token(token))) 529 { 530 token_peeked = false; 531 switch (token.type) 532 { 533 case CIFLexer::KeyGlobalToken: 534 // We have come to the next block: 535 if (pmol->NumAtoms() > 0) 536 { // Found a molecule, so finished 537 finished = true; 538 // move back to the start of the global block: 539 lexer.backup(token.as_text.size(), 'g'); // length of "global_" 540 } 541 else // not yet found a molecule, so go to the next data block 542 { 543 while (lexer.next_token(token) && token.type != CIFLexer::KeyDataToken); 544 if (token.type == CIFLexer::KeyDataToken) 545 { // we have found the next data block: 546 pmol->SetTitle(token.as_text); 547 } 548 } 549 break; 550 case CIFLexer::KeyDataToken: 551 // We have come to the next data block: 552 if (pmol->NumAtoms() > 0) 553 { // Found a molecule, so finished 554 finished = true; 555 // move back to the start of the data block: 556 lexer.backup(5 + token.as_text.size(), 'd'); // length of "data_<name>" 557 } 558 else // not yet found a molecule, so try again 559 pmol->SetTitle(token.as_text); 560 break; 561 case CIFLexer::KeySaveToken: 562 { // Simply eat tokens until the save_ ending token 563 while (lexer.next_token(token) && token.type != CIFLexer::KeySaveEndToken); 564 } 565 break; 566 case CIFLexer::KeyLoopToken: 567 { 568 CIFColumnList columns; 569 CIFTagID::CIFCatName catid = CIFTagID::unread_CIFCatName; 570 while ( (token_peeked = lexer.next_token(token)) == true && token.type == CIFLexer::TagToken) 571 { // Read in the tags 572 CIFTagID::CIFDataName tagid = lexer.lookup_tag(token.as_text); 573 columns.push_back(tagid); 574 if (catid == CIFTagID::unread_CIFCatName && tagid != CIFTagID::unread_CIFDataName) 575 catid = lexer.lookup_cat(tagid); 576 } 577 size_t column_count = columns.size(); 578 switch (catid) 579 { 580 case CIFTagID::atom_site: 581 { 582 int use_cartn = 0, use_residue = 0; 583 use_fract = 0; 584 CIFTagID::CIFDataName atom_type_tag = CIFTagID::unread_CIFDataName; 585 for (CIFColumnList::const_iterator colx = columns.begin(), coly = columns.end(); colx != coly; ++ colx) 586 { 587 switch (* colx) 588 { 589 case CIFTagID::_atom_site_Cartn_x: 590 case CIFTagID::_atom_site_Cartn_y: 591 case CIFTagID::_atom_site_Cartn_z: 592 ++ use_cartn; 593 break; 594 case CIFTagID::_atom_site_fract_x: 595 case CIFTagID::_atom_site_fract_y: 596 case CIFTagID::_atom_site_fract_z: 597 ++ use_fract; 598 break; 599 case CIFTagID::_atom_site_label_comp_id: 600 case CIFTagID::_atom_site_label_seq_id: 601 ++ use_residue; 602 break; 603 case CIFTagID::_atom_site_type_symbol: 604 case CIFTagID::_atom_site_label_atom_id: 605 case CIFTagID::_atom_site_label: 606 if (atom_type_tag < (* colx)) 607 atom_type_tag = (* colx); 608 break; 609 default: 610 break; 611 } 612 } 613 if (use_cartn) 614 { 615 for (CIFColumnList::iterator colx = columns.begin(), coly = columns.end(); colx != coly; ++ colx) 616 if ( (* colx) >= CIFTagID::_atom_site_fract_x && (* colx) <= CIFTagID::_atom_site_fract_z) 617 (* colx) = CIFTagID::unread_CIFDataName; 618 use_fract = 0; 619 } 620 size_t column_idx = 0; 621 OBAtom * atom = nullptr; 622 double x = 0.0, y = 0.0, z = 0.0; 623 CIFResidueMap ResidueMap; 624 unsigned long chain_num = 1, residue_num = 1; 625 unsigned int nbc=0; 626 string residue_name, atom_label, atom_mol_label, tmpSymbol; 627 int atomicNum; 628 OBPairData *label; 629 while (token.type == CIFLexer::ValueToken) // Read in the Fields 630 { 631 if (column_idx == 0) 632 { 633 atom = pmol->NewAtom(); 634 x = y = z = 0.0; 635 } 636 switch (columns[column_idx]) 637 { 638 case CIFTagID::_atom_site_label: // The atomic label within the molecule 639 label = new OBPairData; 640 label->SetAttribute("_atom_site_label"); 641 label->SetValue(token.as_text); 642 label->SetOrigin(fileformatInput); 643 atom->SetData(label); 644 atom_mol_label.assign(token.as_text); 645 646 if (atom_type_tag != CIFTagID::_atom_site_label) 647 break; 648 // Else remove everything starting from the first digit 649 // and drop through to _atom_site_type_symbol 650 if(string::npos != token.as_text.find_first_of("0123456789")) 651 {token.as_text.erase(token.as_text.find_first_of("0123456789"), token.as_text.size());} 652 case CIFTagID::_atom_site_type_symbol: 653 // Problem: posat->mSymbol is not guaranteed to actually be a 654 // symbol see http://www.iucr.org/iucr-top/cif/cifdic_html/1/cif_core.dic/Iatom_type_symbol.html 655 // Try to strip the string to have a better chance to have a 656 // valid symbol 657 // This is not guaranteed to work still, as the CIF standard 658 // allows about any string... 659 tmpSymbol=token.as_text.c_str(); 660 if ((tmpSymbol.size()==1) && isalpha(tmpSymbol[0])) 661 { 662 nbc=1; 663 } 664 else if (tmpSymbol.size()>=2) 665 { 666 if (isalpha(tmpSymbol[0]) && isalpha(tmpSymbol[1])) 667 { 668 nbc=2; 669 } 670 else if (isalpha(tmpSymbol[0])) 671 { 672 nbc=1; 673 } 674 } 675 else 676 { 677 nbc = 0; 678 } 679 if (tmpSymbol.size()>nbc) 680 {// Try to find a formal charge in the symbol 681 int charge=0; 682 int sign=0; 683 for(unsigned int i=nbc;i<tmpSymbol.size();++i) 684 {// Use first number found as formal charge 685 if (isdigit(tmpSymbol[i]) && (charge==0)) 686 { 687 charge=atoi(tmpSymbol.substr(i,1).c_str()); 688 } 689 if ('-'==tmpSymbol[i]) 690 { 691 sign-=1; 692 } 693 if ('+'==tmpSymbol[i]) 694 { 695 sign+=1; 696 } 697 } 698 if (0!=sign) // no sign, no charge 699 { 700 if (charge==0) 701 { 702 charge=1; 703 } 704 stringstream ss; 705 ss<< tmpSymbol <<" / symbol="<<tmpSymbol.substr(0,nbc) 706 <<" charge= "<<sign*charge; 707 obErrorLog.ThrowError(__FUNCTION__, ss.str(), obDebug); 708 atom->SetFormalCharge(sign*charge); 709 } 710 } 711 if (nbc>0) 712 { 713 tmpSymbol=tmpSymbol.substr(0,nbc); 714 } 715 else 716 { 717 stringstream ss; 718 ss<< tmpSymbol <<" / could not derive a symbol" 719 <<" for atomic number. Setting it to default " 720 <<" Xx(atomic number 0)"; 721 obErrorLog.ThrowError(__FUNCTION__, ss.str(), obDebug); 722 tmpSymbol="Xx";//Something went wrong, no symbol ! Default to Xx 723 } 724 atomicNum = OBElements::GetAtomicNum(tmpSymbol.c_str()); 725 // Test for some oxygens with subscripts 726 if (atomicNum == 0 && tmpSymbol[0] == 'O') 727 { 728 atomicNum = 8; // e.g. Ob, OH, etc. 729 } 730 731 atom->SetAtomicNum(atomicNum); //set atomic number, or '0' if the atom type is not recognized 732 atom->SetType(tmpSymbol); 733 break; 734 case CIFTagID::_atom_site_fract_x: 735 case CIFTagID::_atom_site_Cartn_x: 736 x = token.as_number(); 737 break; 738 case CIFTagID::_atom_site_fract_y: 739 case CIFTagID::_atom_site_Cartn_y: 740 y = token.as_number(); 741 break; 742 case CIFTagID::_atom_site_fract_z: 743 case CIFTagID::_atom_site_Cartn_z: 744 z = token.as_number(); 745 break; 746 case CIFTagID::_atom_site_label_atom_id: // The atomic label within the residue 747 atom_label.assign(token.as_text); 748 if (atom_type_tag == CIFTagID::_atom_site_label_atom_id) 749 { 750 for (string::iterator posx = token.as_text.begin(), posy = token.as_text.end(); posx != posy; ++ posx) 751 { 752 char c = (char)toupper(* posx); 753 if ( c < 'A' || c > 'Z' ) 754 { 755 token.as_text.erase(posx, posy); 756 break; 757 } 758 } 759 atom->SetAtomicNum(OBElements::GetAtomicNum(token.as_text.c_str())); 760 atom->SetType(token.as_text); 761 } 762 break; 763 case CIFTagID::_atom_site_label_comp_id: // The residue abbreviation, e.g. ILE 764 residue_name.assign(token.as_text); 765 break; 766 case CIFTagID::_atom_site_label_entity_id: // The chain entity number of the residue, e.g. 2 767 // ignored and replaced by unique id for label_asym_id 768 break; 769 case CIFTagID::_atom_site_label_asym_id: // The strand number of the residue 770 if (token.as_text != last_asym_id) { 771 CIFasymmap::const_iterator asym_it = asym_map.find(token.as_text); 772 if (asym_it == asym_map.end()) { 773 ++next_asym_no; 774 asym_it = 775 asym_map.insert(CIFasymmap::value_type(token.as_text, 776 next_asym_no)).first; 777 } 778 chain_num = asym_it->second; 779 last_asym_id = token.as_text; 780 } 781 break; 782 case CIFTagID::_atom_site_label_seq_id: // The sequence number of the residue, within the chain, e.g. 12 783 residue_num = token.as_unsigned(); 784 break; 785 case CIFTagID::_atom_site_occupancy: // The occupancy of the site. 786 { 787 OBPairFloatingPoint * occup = new OBPairFloatingPoint; 788 occup->SetAttribute("_atom_site_occupancy"); 789 double occupancy = std::max(0.0, std::min(1.0, token.as_number())); // clamp occupancy to [0.0, 1.0] bugfix 790 occup->SetValue(occupancy); 791 occup->SetOrigin(fileformatInput); 792 atom->SetData(occup); 793 } 794 break; 795 case CIFTagID::unread_CIFDataName: 796 default: 797 break; 798 } 799 ++ column_idx; 800 if (column_idx == column_count) 801 { 802 atom->SetVector(x, y, z); 803 if (use_residue == 2) 804 { 805 has_residue_information = true; 806 CIFResidueID res_id(chain_num, residue_num); 807 CIFResidueMap::const_iterator resx = ResidueMap.find(res_id); 808 OBResidue * res; 809 if (resx == ResidueMap.end()) 810 { 811 ResidueMap[res_id] = pmol->NumResidues(); 812 res = pmol->NewResidue(); 813 res->SetChainNum(chain_num); 814 res->SetNum(residue_num); 815 res->SetName(residue_name); 816 } 817 else 818 res = pmol->GetResidue( (* resx).second ); 819 res->AddAtom(atom); 820 if (!atom_label.empty()) 821 res->SetAtomID(atom, atom_label); 822 unsigned long serial_no = strtoul(atom_mol_label.c_str(), nullptr, 10); 823 if (serial_no > 0) 824 res->SetSerialNum(atom, serial_no); 825 } 826 column_idx = 0; 827 } 828 token_peeked = lexer.next_token(token); 829 } 830 } 831 break; 832 case CIFTagID::symmetry_equiv: 833 { 834 size_t column_idx = 0; 835 while (token.type == CIFLexer::ValueToken) // Read in the Fields 836 { 837 if ((columns[column_idx] == CIFTagID::_symmetry_equiv_pos_as_xyz) 838 && token.as_text.find(UNKNOWN_VALUE) == string::npos) 839 space_group.AddTransform(token.as_text); 840 ++ column_idx; 841 if (column_idx == column_count) 842 column_idx = 0; 843 token_peeked = lexer.next_token(token); 844 } 845 } 846 break; 847 848 case CIFTagID::atom_type: //Atoms oxidations 849 { 850 size_t column_idx = 0; 851 string atom_label = ""; 852 double charge = 0; 853 while (token.type == CIFLexer::ValueToken) // Read in the Fields 854 { 855 if (columns[column_idx] == CIFTagID::_atom_type_symbol) 856 atom_label = token.as_text; 857 if (columns[column_idx] == CIFTagID::_atom_type_oxidation_number) 858 charge = token.as_number(); 859 ++ column_idx; 860 if (column_idx == column_count) 861 { 862 atomic_charges[atom_label] = charge; 863 column_idx = 0; 864 } 865 token_peeked = lexer.next_token(token); 866 } 867 } 868 break; 869 870 871 case CIFTagID::unread_CIFCatName: 872 default: 873 while (token.type == CIFLexer::ValueToken) // Eat the values, we don't want them 874 token_peeked = lexer.next_token(token); 875 break; 876 } 877 } 878 break; 879 case CIFLexer::TagToken: 880 { 881 CIFTagID::CIFDataName tag_id = lexer.lookup_tag(token.as_text); 882 // get the value 883 lexer.next_token(token); 884 switch (tag_id) 885 { 886 case CIFTagID::_cell_length_a: 887 cell_a = token.as_number(); 888 ++ use_cell; 889 break; 890 case CIFTagID::_cell_length_b: 891 cell_b = token.as_number(); 892 ++ use_cell; 893 break; 894 case CIFTagID::_cell_length_c: 895 cell_c = token.as_number(); 896 ++ use_cell; 897 break; 898 case CIFTagID::_cell_angle_alpha: 899 cell_alpha = token.as_number(); 900 ++ use_cell; 901 break; 902 case CIFTagID::_cell_angle_beta: 903 cell_beta = token.as_number(); 904 ++ use_cell; 905 break; 906 case CIFTagID::_cell_angle_gamma: 907 cell_gamma = token.as_number(); 908 ++ use_cell; 909 break; 910 case CIFTagID::_chemical_name_systematic: 911 case CIFTagID::_chemical_name_mineral: 912 case CIFTagID::_chemical_name_structure_type: 913 case CIFTagID::_chemical_name_common: 914 if (tag_id > name_tag) 915 { 916 name_tag = tag_id; 917 pmol->SetTitle(token.as_text); 918 } 919 break; 920 case CIFTagID::_chemical_formula_analytical: 921 case CIFTagID::_chemical_formula_structural: 922 case CIFTagID::_chemical_formula_iupac: 923 case CIFTagID::_chemical_formula_moiety: 924 if (tag_id > formula_tag) 925 { 926 formula_tag = tag_id; 927 pmol->SetFormula(token.as_text); 928 } 929 break; 930 case CIFTagID::_space_group_IT_number: 931 case CIFTagID::_symmetry_Int_Tables_number: 932 space_group_name.assign(token.as_text); 933 space_group.SetId(atoi(space_group_name.c_str())); 934 break; 935 case CIFTagID::_space_group_name_Hall: 936 case CIFTagID::_symmetry_space_group_name_Hall: 937 space_group_name.assign(token.as_text); 938 space_group.SetHallName(space_group_name.c_str()); 939 break; 940 case CIFTagID::_space_group_name_H_M_alt: 941 case CIFTagID::_symmetry_space_group_name_H_M: 942 space_group_name.assign(token.as_text); 943 space_group.SetHMName(space_group_name.c_str()); 944 break; 945 case CIFTagID::_symmetry_equiv_pos_as_xyz: 946 space_group.AddTransform(token.as_text); 947 break; 948 default: // eat the value for this tag 949 break; 950 } 951 } 952 break; 953 case CIFLexer::KeyStopToken: 954 case CIFLexer::ValueToken: 955 default: 956 break; 957 } 958 } 959 if (pmol->NumAtoms() > 0) 960 { 961 if (use_cell >= 6) 962 { 963 OBUnitCell * pCell = new OBUnitCell; // No matching "delete" because it's saved in pmol->SetData 964 pCell->SetOrigin(fileformatInput); 965 pCell->SetData(cell_a, cell_b, cell_c, 966 cell_alpha, 967 cell_beta, 968 cell_gamma 969 ); 970 pCell->SetSpaceGroup(space_group_name); 971 const SpaceGroup * pSpaceGroup = SpaceGroup::Find( & space_group); 972 if (pSpaceGroup) 973 pCell->SetSpaceGroup(pSpaceGroup); 974 else 975 space_group_failed = true; 976 pmol->SetData(pCell); 977 if (use_fract) 978 { 979 for (OBAtomIterator atom_x = pmol->BeginAtoms(), atom_y = pmol->EndAtoms(); atom_x != atom_y; ++ atom_x) 980 { 981 OBAtom * atom = (* atom_x); 982 if (wrap_coords) 983 atom->SetVector(pCell->FractionalToCartesian( 984 pCell->WrapFractionalCoordinate(atom->GetVector()))); 985 else 986 atom->SetVector(pCell->FractionalToCartesian(atom->GetVector())); 987 } // Note: this is where we could keep the original fractional coordinates, e.g. in a new OBCoord class 988 } 989 if (pConv->IsOption("p",OBConversion::INOPTIONS)) 990 pmol->SetPeriodicMol(); 991 } 992 for (OBAtomIterator atom_x = pmol->BeginAtoms(), atom_y = pmol->EndAtoms(); atom_x != atom_y; ++atom_x ) 993 { 994 OBAtom * atom = (* atom_x); 995 OBPairData * pd = dynamic_cast<OBPairData *>( atom->GetData( "_atom_site_label" ) ); 996 if (pd != nullptr) 997 { 998 if( atomic_charges.count( pd->GetValue() ) > 0 ) 999 { 1000 OBPairFloatingPoint * charge_obd = new OBPairFloatingPoint; 1001 charge_obd->SetAttribute("input_charge"); 1002 charge_obd->SetValue(atomic_charges[pd->GetValue()] ); 1003 charge_obd->SetOrigin(fileformatInput); 1004 atom->SetData(charge_obd); 1005 } 1006 } 1007 } 1008 1009 if (!pConv->IsOption("b",OBConversion::INOPTIONS)) 1010 { 1011 pmol->ConnectTheDots(); 1012 if (!pConv->IsOption("s",OBConversion::INOPTIONS)) 1013 pmol->PerceiveBondOrders(); 1014 } 1015 } 1016 1017 if (space_group_failed) 1018 { 1019 string transformations; 1020 transform3dIterator ti; 1021 const transform3d *t = space_group.BeginTransform(ti); 1022 while(t){ 1023 transformations += t->DescribeAsString() + " "; 1024 t = space_group.NextTransform(ti); 1025 } 1026 1027 OBOp* pOp = OBOp::FindType("fillUC"); 1028 if (pOp && transformations.length()) 1029 { 1030 map<string, string> m; 1031 m.insert(pair<string, string>("transformations", transformations)); 1032 pOp->Do(pmol, "strict", &m); 1033 } 1034 } 1035 1036 pmol->EndModify(); 1037 } 1038 if (has_residue_information) 1039 pmol->SetChainsPerceived(); 1040 return (pmol->NumAtoms() > 0 ? true : false); 1041 } 1042 1043 //////////////////////////////////////////////////////////////// 1044 WriteMolecule(OBBase * pOb,OBConversion * pConv)1045 bool mmCIFFormat::WriteMolecule(OBBase* pOb, OBConversion* pConv) 1046 { 1047 OBMol* pmol = dynamic_cast<OBMol*>(pOb); 1048 if (pmol == nullptr) 1049 return false; 1050 1051 //Define some references so we can use the old parameter names 1052 ostream & ofs = * pConv->GetOutStream(); 1053 1054 char buffer[BUFF_SIZE]; 1055 1056 string id; 1057 for (const char * p = pmol->GetTitle(); * p; ++ p) 1058 if ( (* p) > ' ' && (* p) <= '~' ) 1059 id.append(1, (char)toupper(* p)); 1060 if (id.empty()) 1061 { 1062 snprintf(buffer, BUFF_SIZE, "T%lu", (unsigned long)time(nullptr)); 1063 id.assign(buffer); 1064 } 1065 ofs << "# --------------------------------------------------------------------------" << endl; 1066 ofs << "#" << endl; 1067 ofs << "# CIF file generated by openbabel " << BABEL_VERSION << " http://openbabel.org/" << endl; 1068 ofs << "# to comply with the Macromolecular CIF Dictionary (cif_mm.dic) version 2.0.11 http://mmcif.pdb.org/" << endl; 1069 ofs << "# The contents of this file were derived from " << pConv->GetInFilename() << endl; 1070 ofs << "#" << endl; 1071 ofs << "#---------------------------------------------------------------------------" << endl; 1072 ofs << endl; 1073 ofs << "data_" << id << endl; 1074 ofs << endl; 1075 ofs << "###########" << endl; 1076 ofs << "## ENTRY ##" << endl; 1077 ofs << "###########" << endl; 1078 ofs << endl; 1079 ofs << "_entry.id\t" << id << endl; 1080 ofs << endl; 1081 if (* (pmol->GetTitle())) 1082 { 1083 ofs << "##############" << endl; 1084 ofs << "## CHEMICAL ##" << endl; 1085 ofs << "##############" << endl; 1086 ofs << endl; 1087 ofs << "_chemical.entry_id\t" << id << endl; 1088 ofs << "_chemical.name_common\t'" << pmol->GetTitle() << "'" << endl; 1089 ofs << endl; 1090 } 1091 if (! (pmol->GetSpacedFormula().empty())) 1092 { 1093 ofs << "######################" << endl; 1094 ofs << "## CHEMICAL FORMULA ##" << endl; 1095 ofs << "######################" << endl; 1096 ofs << endl; 1097 ofs << "_chemical_formula.entry_id\t" << id << endl; 1098 ofs << "_chemical_formula.structural\t'" << pmol->GetFormula() << "'" << endl; 1099 ofs << endl; 1100 } 1101 ofs << "###############" << endl; 1102 ofs << "## ATOM_SITE ##" << endl; 1103 ofs << "###############" << endl; 1104 ofs << endl; 1105 ofs << "loop_" << endl; 1106 ofs << "_atom_site.id" << endl; 1107 ofs << "_atom_site.type_symbol" << endl; 1108 bool has_residues = (pmol->NumResidues() > 0); 1109 if (has_residues) 1110 { 1111 ofs << "_atom_site.label_atom_id" << endl; 1112 ofs << "_atom_site.label_comp_id" << endl; 1113 ofs << "_atom_site.label_entity_id" << endl; 1114 ofs << "_atom_site.label_seq_id" << endl; 1115 } 1116 ofs << "_atom_site.Cartn_x" << endl; 1117 ofs << "_atom_site.Cartn_y" << endl; 1118 ofs << "_atom_site.Cartn_z" << endl; 1119 size_t site_id = 1; 1120 for (OBAtomIterator atom_x = pmol->BeginAtoms(), atom_y = pmol->EndAtoms(); atom_x != atom_y; ++ atom_x, ++ site_id) 1121 { 1122 OBAtom * atom = (* atom_x); 1123 ofs << '\t' << site_id << '\t' << OBElements::GetSymbol(atom->GetAtomicNum()); 1124 if (has_residues) 1125 { 1126 OBResidue * pRes = atom->GetResidue(); 1127 string resname(pRes->GetName()), atomname(pRes->GetAtomID(atom)); 1128 if (atomname.empty()) 1129 { 1130 snprintf(buffer, BUFF_SIZE, "%s%lu", OBElements::GetSymbol(atom->GetAtomicNum()), (unsigned long)site_id); 1131 atomname.assign(buffer); 1132 } 1133 if (resname.empty()) 1134 resname.assign("UNK"); 1135 ofs << '\t' << atomname << '\t' << resname << '\t' << pRes->GetChainNum() << '\t' << pRes->GetNum() << endl; 1136 } 1137 ofs << '\t' << atom->GetX() << '\t' << atom->GetY() << '\t' << atom->GetZ() << endl; 1138 } 1139 ofs << endl; 1140 if (pmol->HasData(OBGenericDataType::UnitCell)) 1141 { 1142 OBUnitCell * pCell = (OBUnitCell * )pmol->GetData(OBGenericDataType::UnitCell); 1143 ofs << "##########" << endl; 1144 ofs << "## CELL ##" << endl; 1145 ofs << "##########" << endl; 1146 ofs << endl; 1147 ofs << "_cell.entry_id\t" << id << endl; 1148 ofs << "_cell.length_a\t" << pCell->GetA() << endl; 1149 ofs << "_cell.length_b\t" << pCell->GetB() << endl; 1150 ofs << "_cell.length_c\t" << pCell->GetC() << endl; 1151 ofs << "_cell.angle_alpha\t" << pCell->GetAlpha() << endl; 1152 ofs << "_cell.angle_beta\t" << pCell->GetBeta() << endl; 1153 ofs << "_cell.angle_gamma\t" << pCell->GetGamma() << endl; 1154 ofs << endl; 1155 const SpaceGroup * pSG = pCell->GetSpaceGroup(); 1156 if (pSG) 1157 { 1158 ofs << "#################" << endl; 1159 ofs << "## SPACE GROUP ##" << endl; 1160 ofs << "#################" << endl; 1161 ofs << endl; 1162 ofs << "_space_group.id\t" << id << endl; 1163 bool loop_transforms = false; 1164 if (pSG->GetId()) 1165 { 1166 ofs << "_space_group.IT_number\t" << pSG->GetId() << endl; 1167 } 1168 if (! (pSG->GetHallName().empty())) 1169 { 1170 loop_transforms = true; 1171 ofs << "_space_group.name_Hall\t'" << pSG->GetHallName() << "'" << endl; 1172 } 1173 if (! (pSG->GetHMName().empty())) 1174 { 1175 loop_transforms = true; 1176 // Do we have an extended HM symbol, with origin choice as ":1" or ":2" ? If so, remove it. 1177 size_t n=pSG->GetHMName().find(":"); 1178 if(n==string::npos) 1179 ofs << "_space_group_name_H-M_alt '" << pSG->GetHMName() << "'" << endl; 1180 else 1181 ofs << "_space_group_name_H-M_alt '" << pSG->GetHMName().substr(0,n) << "'" << endl; 1182 } 1183 ofs << endl; 1184 1185 transform3dIterator transx; 1186 const transform3d * trans = pSG->BeginTransform(transx); 1187 if (trans) 1188 { 1189 ofs << "####################" << endl; 1190 ofs << "## SYMMETRY EQUIV ##" << endl; 1191 ofs << "####################" << endl; 1192 ofs << endl; 1193 size_t symid = 1; 1194 if (!loop_transforms) 1195 { 1196 ofs << "_symmetry_equiv.id\t" << symid << endl; 1197 ofs << "_symmetry_equiv.pos_as_xyz\t'" << trans->DescribeAsString() << "'" << endl; 1198 } 1199 else 1200 { 1201 ofs << "loop_" << endl; 1202 ofs << "_symmetry_equiv.id" << endl; 1203 ofs << "_symmetry_equiv.pos_as_xyz" << endl; 1204 while (trans) 1205 { 1206 ofs << '\t' << symid << "\t'" << trans->DescribeAsString() << "'" << endl; 1207 trans = pSG->NextTransform(transx); 1208 } 1209 } 1210 ofs << endl; 1211 } 1212 } 1213 } 1214 1215 return true; 1216 } 1217 1218 } //namespace OpenBabel 1219