1 /**********************************************************************
2 mmcifformat.cpp - Conversion to and from mmCIF format.
3 Copyright (C) Scarlet Line 2007
4 
5 This file is part of the Open Babel project.
6 For more information, see <http://openbabel.org/>
7 
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation version 2 of the License.
11 
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 ***********************************************************************/
17 
18 #include <openbabel/babelconfig.h>
19 #include <openbabel/obmolecformat.h>
20 #include <openbabel/mol.h>
21 #include <openbabel/atom.h>
22 #include <openbabel/elements.h>
23 #include <openbabel/generic.h>
24 
25 #include <openbabel/op.h>
26 
27 #include <iostream>
28 #include <algorithm>
29 #include <ctype.h>
30 
31 using namespace std;
32 namespace OpenBabel
33 {
34  static const string UNKNOWN_VALUE = "?";
35 
36  class mmCIFFormat : public OBMoleculeFormat
37  {
38  public:
39    //Register this format type ID
mmCIFFormat()40    mmCIFFormat()
41    { // Copied from the Chemical MIME Page at http://www.ch.ic.ac.uk/chemime/
42      OBConversion::RegisterFormat("mcif", this, "chemical/x-mmcif");
43      OBConversion::RegisterFormat("mmcif", this, "chemical/x-mmcif");
44      // Uncomment the following line, and this file will handle all CIF formats
45      // OBConversion::RegisterFormat("cif", this, "chemical/x-cif");
46 
47      OBConversion::RegisterOptionParam("s", this);
48      OBConversion::RegisterOptionParam("p", this);
49      OBConversion::RegisterOptionParam("b", this);
50      OBConversion::RegisterOptionParam("w", this);
51    }
52 
Description()53    virtual const char* Description() //required
54    {
55      return
56        "Macromolecular Crystallographic Info\n "
57        "Read Options e.g. -as\n"
58        "  s  Output single bonds only\n"
59        "  p  Apply periodic boundary conditions for bonds\n"
60        "  b  Disable bonding entirely\n"
61        "  w  Wrap atomic coordinates into unit cell box\n\n";
62    };
63 
SpecificationURL()64    virtual const char* SpecificationURL()
65    { return "http://mmcif.pdb.org/";}; //optional
66    // CIF itself is at http://www.iucr.org/iucr-top/cif/index.html
67 
GetMIMEType()68    virtual const char* GetMIMEType()
69    { return "chemical/x-mmcif"; };
70 
71    //*** This section identical for most OBMol conversions ***
72    ////////////////////////////////////////////////////
73    /// The "API" interface functions
74    virtual int SkipObjects(int n, OBConversion* pConv);
75    virtual bool ReadMolecule(OBBase* pOb, OBConversion* pConv);
76    virtual bool WriteMolecule(OBBase* pOb, OBConversion* pConv);
77  };
78 
79  //Make an instance of the format class
80  mmCIFFormat themmCIFFormat;
81 
82  struct CIFTagID
83    {
84    enum CIFCatName
85      {
86      unread_CIFCatName,
87      atom_site,
88      cell,
89      chemical,
90      chemical_formula,
91      symmetry,
92      symmetry_equiv,
93      space_group,
94      atom_type,
95      MAX_CIFCatName
96      };
97    enum CIFDataName
98      {
99      unread_CIFDataName,
100      _atom_site_fract_x, // The x coordinate specified as a fraction of _cell_length_a
101      _atom_site_fract_y, // The y coordinate specified as a fraction of _cell_length_b
102      _atom_site_fract_z, // The z coordinate specified as a fraction of _cell_length_c
103      _atom_site_Cartn_x, // The x coordinate in angstroms
104      _atom_site_Cartn_y, // The y coordinate in angstroms
105      _atom_site_Cartn_z, // The z coordinate in angstroms
106      _atom_site_label, // The atomic label if more detailed label info unavailable
107      _atom_site_label_atom_id, // The atomic label within the residue
108      _atom_site_label_comp_id, // The residue abbreviation, e.g. ILE
109      _atom_site_label_entity_id, // The chain entity number of the residue, e.g. 2
110      _atom_site_label_asym_id, // The unique chain id
111      _atom_site_label_seq_id, // The sequence number of the residue, within the chain, e.g. 12
112      _atom_site_type_symbol, // Atomic symbol, e.g. C
113      _atom_site_occupancy,
114      MAX_atom_site,
115      _cell_length_a, // Unit-cell length a in Angstroms
116      _cell_length_b, // Unit-cell length b in Angstroms
117      _cell_length_c, // Unit-cell length c in Angstroms
118      _cell_angle_alpha, // Unit-cell angle alpha in degrees
119      _cell_angle_beta, // Unit-cell angle beta in degrees
120      _cell_angle_gamma, // Unit-cell angle gamma in degrees
121      MAX_cell,
122      // chemical name organized by increasing desirability
123      _chemical_name_common,
124      _chemical_name_structure_type,
125      _chemical_name_mineral,
126      _chemical_name_systematic,
127      MAX_chemical,
128      // chemical formulae organized by increasing desirability
129      _chemical_formula_moiety,
130      _chemical_formula_iupac,
131      _chemical_formula_structural,
132      _chemical_formula_analytical,
133      MAX_chemical_formula,
134      _symmetry_Int_Tables_number,
135      _symmetry_space_group_name_Hall,
136      _symmetry_space_group_name_H_M,
137      MAX_symmetry,
138      _symmetry_equiv_pos_as_xyz,
139      MAX_symmetry_equiv,
140      _space_group_IT_number,
141      _space_group_name_Hall,
142      _space_group_name_H_M_alt,
143      MAX_space_group,
144      _atom_type_symbol,
145      _atom_type_oxidation_number,
146      MAX_atom_type,
147      MAX_CIFDataName
148      };
149    char  tagname[76];
150    CIFDataName tagid;
151    };
152  typedef vector<CIFTagID::CIFDataName> CIFColumnList;
153  typedef map<string, CIFTagID::CIFDataName> CIFtagmap;
154  struct CIFResidueID
155    {
156    unsigned long ChainNum; // The number of the chain
157    unsigned long ResNum;  // The number of the residue within the chain
CIFResidueIDOpenBabel::CIFResidueID158    CIFResidueID()
159      {}
CIFResidueIDOpenBabel::CIFResidueID160    CIFResidueID(unsigned long c, unsigned long r)
161    :ChainNum(c), ResNum(r)
162      {}
CIFResidueIDOpenBabel::CIFResidueID163    CIFResidueID(const CIFResidueID & other)
164    :ChainNum(other.ChainNum), ResNum(other.ResNum)
165      {}
operator =OpenBabel::CIFResidueID166    CIFResidueID & operator=(const CIFResidueID & other)
167      {
168      ChainNum = other.ChainNum;
169      ResNum = other.ResNum;
170      return (* this);
171      }
operator <OpenBabel::CIFResidueID172    bool operator< (const CIFResidueID & other) const
173      {
174      return ( ChainNum < other.ChainNum ? true : ( other.ChainNum < ChainNum ? false : ResNum < other.ResNum ) );
175      }
176    };
177  typedef map<CIFResidueID, int> CIFResidueMap;
178  CIFtagmap CIFtagLookupTable;
179 
180  CIFTagID CIFTagsRead[] =
181    {
182    { "_atom_site_fract_x", CIFTagID::_atom_site_fract_x },
183    { "_atom_site_fract_y", CIFTagID::_atom_site_fract_y },
184    { "_atom_site_fract_z", CIFTagID::_atom_site_fract_z },
185    { "_atom_site_cartn_x", CIFTagID::_atom_site_Cartn_x },
186    { "_atom_site_cartn_y", CIFTagID::_atom_site_Cartn_y },
187    { "_atom_site_cartn_z", CIFTagID::_atom_site_Cartn_z },
188    { "_atom_site_type_symbol", CIFTagID::_atom_site_type_symbol },
189    { "_atom_site_occupancy", CIFTagID::_atom_site_occupancy},
190    { "_atom_site_id", CIFTagID::_atom_site_label },
191    { "_atom_site_label", CIFTagID::_atom_site_label },
192    { "_atom_site_label_atom_id", CIFTagID::_atom_site_label_atom_id },
193    { "_atom_site_label_comp_id", CIFTagID::_atom_site_label_comp_id },
194    { "_atom_site_label_entity_id", CIFTagID::_atom_site_label_entity_id },
195    { "_atom_site_label_seq_id", CIFTagID::_atom_site_label_seq_id },
196    { "_atom_site_label_asym_id", CIFTagID::_atom_site_label_asym_id },
197    { "_cell_length_a", CIFTagID::_cell_length_a },
198    { "_cell_length_b", CIFTagID::_cell_length_b },
199    { "_cell_length_c", CIFTagID::_cell_length_c },
200    { "_cell_angle_alpha", CIFTagID::_cell_angle_alpha },
201    { "_cell_angle_beta", CIFTagID::_cell_angle_beta },
202    { "_cell_angle_gamma", CIFTagID::_cell_angle_gamma },
203    { "_chemical_name_systematic", CIFTagID::_chemical_name_systematic },
204    { "_chemical_name_mineral", CIFTagID::_chemical_name_mineral },
205    { "_chemical_name_structure_type", CIFTagID::_chemical_name_structure_type },
206    { "_chemical_name_common", CIFTagID::_chemical_name_common },
207    { "_chemical_formula_analytical", CIFTagID::_chemical_formula_analytical },
208    { "_chemical_formula_structural", CIFTagID::_chemical_formula_structural },
209    { "_chemical_formula_iupac", CIFTagID::_chemical_formula_iupac },
210    { "_chemical_formula_moiety", CIFTagID::_chemical_formula_moiety },
211    { "_space_group_it_number", CIFTagID::_space_group_IT_number },
212    { "_space_group_name_hall", CIFTagID::_space_group_name_Hall },
213    { "_space_group_name_h-m_alt", CIFTagID::_space_group_name_H_M_alt },
214    { "_symmetry_int_tables_number", CIFTagID::_symmetry_Int_Tables_number },
215    { "_symmetry_space_group_name_hall", CIFTagID::_symmetry_space_group_name_Hall },
216    { "_symmetry_space_group_name_h-m", CIFTagID::_symmetry_space_group_name_H_M },
217    { "_symmetry_equiv_pos_as_xyz", CIFTagID::_symmetry_equiv_pos_as_xyz },
218    { "_space_group_symop_operation_xyz", CIFTagID::_symmetry_equiv_pos_as_xyz },
219    { "_atom_type_symbol", CIFTagID::_atom_type_symbol },
220    { "_atom_type_oxidation_number",CIFTagID::_atom_type_oxidation_number },
221    { "", CIFTagID::unread_CIFDataName }
222    };
223 
224  class CIFLexer
225  {
226  public:
227    enum TokenType
228      {
229      UnknownToken,
230      KeyDataToken,
231      KeyLoopToken,
232      KeySaveToken,
233      KeySaveEndToken,
234      KeyStopToken,
235      KeyGlobalToken,
236      TagToken,
237      ValueToken,
238      ValueOrKeyToken,
239      MAXTokenType
240      };
241    struct Token
242      {
243      TokenType type;
244      string as_text;
as_numberOpenBabel::CIFLexer::Token245      double  as_number() const
246        { return strtod(as_text.c_str(), nullptr); }
as_unsignedOpenBabel::CIFLexer::Token247      unsigned long  as_unsigned() const
248        { return strtoul(as_text.c_str(), nullptr, 10); }
249      };
CIFLexer(std::istream * in)250    CIFLexer(std::istream * in)
251    :input(in)
252      {
253      last_char = 0;
254      next_char = input->get();
255      }
256    bool next_token(CIFLexer::Token & token);
257    static CIFTagID::CIFDataName lookup_tag(const string & tag_name);
258    static CIFTagID::CIFCatName lookup_cat(CIFTagID::CIFDataName tagid);
advance()259    void advance()
260      {
261      last_char = next_char;
262      next_char = input->get();
263      }
backup(size_t count)264    void backup(size_t count)
265      {
266      for ( ++ count; count; -- count )
267        input->unget();
268      last_char = 0;
269      next_char = input->get();
270      }
backup(size_t count,char next)271    void backup(size_t count, char next)
272      {
273      for ( ; count; -- count )
274        input->unget();
275      last_char = 0;
276      next_char = next;
277      }
good() const278    bool good() const
279      { return input->good(); }
280  private:
281    istream  * input;
282    int  last_char, next_char;
283  };
lookup_tag(const string & tag_name)284  CIFTagID::CIFDataName CIFLexer::lookup_tag(const string & tag_name)
285  {
286     if (CIFtagLookupTable.empty())
287       {
288       for (size_t idx = 0; CIFTagsRead[idx].tagid != CIFTagID::unread_CIFDataName; ++ idx)
289         {
290         CIFtagLookupTable.insert(CIFtagmap::value_type(string(CIFTagsRead[idx].tagname), CIFTagsRead[idx].tagid ));
291         }
292       }
293    CIFTagID::CIFDataName rtn = CIFTagID::unread_CIFDataName;
294     CIFtagmap::const_iterator found = CIFtagLookupTable.find(tag_name);
295     if (found != CIFtagLookupTable.end())
296       rtn = (* found).second;
297     return rtn;
298  }
lookup_cat(CIFTagID::CIFDataName tagid)299  CIFTagID::CIFCatName CIFLexer::lookup_cat(CIFTagID::CIFDataName tagid)
300  {
301    CIFTagID::CIFCatName catid = CIFTagID::unread_CIFCatName;
302    if (tagid > CIFTagID::unread_CIFDataName)
303      {
304      if (tagid < CIFTagID::MAX_atom_site)
305        catid = CIFTagID::atom_site;
306      else if (tagid < CIFTagID::MAX_cell)
307        catid = CIFTagID::cell;
308      else if (tagid < CIFTagID::MAX_chemical)
309        catid = CIFTagID::chemical;
310      else if (tagid < CIFTagID::MAX_chemical_formula)
311        catid = CIFTagID::chemical_formula;
312      else if (tagid < CIFTagID::MAX_symmetry)
313        catid = CIFTagID::symmetry;
314      else if (tagid < CIFTagID::MAX_symmetry_equiv)
315        catid = CIFTagID::symmetry_equiv;
316      else if (tagid < CIFTagID::MAX_space_group)
317        catid = CIFTagID::space_group;
318      else if (tagid < CIFTagID::MAX_atom_type)
319        catid = CIFTagID::atom_type;
320      }
321    return catid;
322  }
323 
next_token(CIFLexer::Token & token)324  bool CIFLexer::next_token(CIFLexer::Token & token)
325  {
326  token.type = CIFLexer::UnknownToken;
327  token.as_text.clear();
328  while (token.type == CIFLexer::UnknownToken && input->good())
329    {
330    if (next_char <= ' ')
331      { // whitespace
332      advance();
333      }
334    else
335      { // i.e. not WhiteSpace
336      switch(next_char)
337        {
338      // Comment handling
339      case '#':
340        do // eat comment to the end of the line
341          {
342          advance();
343          } while (next_char != '\n' && input->good());
344        // We are now pointing at EOL or EOF
345        break;
346      // Tag handling
347      case '_':
348        do // read name to the next whitespace
349          {
350          if (next_char == '.') // combines DDL1 and DDL2 tag names
351            next_char = '_';
352          else
353            next_char = tolower(next_char);
354          token.as_text.push_back((char)next_char);
355          advance();
356          } while (next_char > ' ' && input->good());
357        // We are now pointing at the next whitespace
358        token.type = CIFLexer::TagToken;
359        break;
360      // Quoted data handling
361      case '"':
362        do // read name to the next quote-whitespace
363          {
364          advance();
365          if (next_char == '"')
366            {
367            while (next_char == '"')
368              {
369              advance();
370              if (next_char <= ' ') // whitespace
371                break;
372              token.as_text.push_back((char)last_char);
373              }
374            if (next_char <= ' ') // whitespace
375              break;
376            }
377          token.as_text.push_back((char)next_char);
378          } while (input->good());
379        // We are now pointing at the next whitespace
380        token.type = CIFLexer::ValueToken;
381        break;
382      case '\'':
383        do // read name to the next quote-whitespace
384          {
385          advance();
386          if (next_char == '\'')
387            {
388            while (next_char == '\'')
389              {
390              advance();
391              if (next_char <= ' ') // whitespace
392                break;
393              token.as_text.push_back((char)last_char);
394              }
395            if (next_char <= ' ') // whitespace
396              break;
397            }
398          token.as_text.push_back((char)next_char);
399          } while (input->good());
400        // We are now pointing at the next whitespace
401        token.type = CIFLexer::ValueToken;
402        break;
403      case ';':
404        if (last_char == '\n')
405          {
406          do // read name to the next <eol>-;
407            {
408            advance();
409            if (next_char == '\n')
410              {
411              while (next_char == '\n')
412                {
413                advance();
414                if (next_char == ';') // end
415                  break;
416                token.as_text.push_back((char)last_char);
417                }
418              if (next_char == ';') // end
419                {
420                advance(); // go past the end
421                break;
422                }
423              }
424            token.as_text.push_back((char)next_char);
425            } while (input->good());
426          // We are now pointing at the next whitespace
427          token.type = CIFLexer::ValueToken;
428          break;
429          }
430        // drop through to the default case
431      default: // reading an un-quoted text string
432        do // read text to the next whitespace
433          {
434          token.as_text.push_back((char)next_char);
435          advance();
436          } while (next_char > ' ' && input->good());
437        token.type = CIFLexer::ValueOrKeyToken;
438        // We are now pointing at the next whitespace
439        break;
440        }
441      }
442    }
443  if (token.type == CIFLexer::ValueOrKeyToken)
444    {
445    string::size_type len = token.as_text.size();
446    if (len == 1 && token.as_text[0] == '.')
447      token.type = CIFLexer::ValueToken;
448    else if (!strncasecmp(token.as_text.c_str(), "data_", 5))
449      {
450      token.type = CIFLexer::KeyDataToken;
451      token.as_text.erase(0, 5);
452      }
453    else if (!strcasecmp(token.as_text.c_str(), "loop_"))
454      token.type = CIFLexer::KeyLoopToken;
455    else if (!strncasecmp(token.as_text.c_str(), "save_", 5))
456      {
457      if (len == 5)
458        {
459        token.type = CIFLexer::KeySaveEndToken;
460        }
461      else
462        {
463        token.type = CIFLexer::KeySaveToken;
464        token.as_text.erase(0, 5);
465        }
466      }
467    else if (!strcasecmp(token.as_text.c_str(), "stop_"))
468      token.type = CIFLexer::KeyStopToken;
469    else if (!strcasecmp(token.as_text.c_str(), "global_"))
470      token.type = CIFLexer::KeyGlobalToken;
471    else
472      token.type = CIFLexer::ValueToken;
473    }
474  return token.type != CIFLexer::UnknownToken;
475  }
476  /////////////////////////////////////////////////////////////////
SkipObjects(int n,OBConversion * pConv)477   int mmCIFFormat::SkipObjects(int n, OBConversion* pConv)
478  {
479    if (n == 0)
480      ++ n;
481    CIFLexer lexer(pConv->GetInStream());
482    CIFLexer::Token token;
483    while (n && lexer.good())
484      {
485      while ( lexer.next_token(token) && token.type != CIFLexer::KeyDataToken);
486      -- n;
487      }
488    if (lexer.good())
489      lexer.backup(5 + token.as_text.size(), 'd'); // length of "data_<name>"
490 
491    return lexer.good() ? 1 : -1;
492  }
ReadMolecule(OBBase * pOb,OBConversion * pConv)493  bool mmCIFFormat::ReadMolecule(OBBase* pOb, OBConversion* pConv)
494  {
495    OBMol* pmol = pOb->CastAndClear<OBMol>();
496    if (pmol == nullptr)
497      return false;
498 
499    CIFLexer lexer(pConv->GetInStream());
500    CIFLexer::Token token;
501 
502    typedef map<string, unsigned> CIFasymmap;
503    CIFasymmap asym_map;
504    string last_asym_id = "";
505    unsigned next_asym_no = 0;
506    bool has_residue_information = false;
507 
508    pmol->SetChainsPerceived(); // avoid perception if we are setting residues
509 
510    bool wrap_coords = pConv->IsOption("w",OBConversion::INOPTIONS);
511 
512    // move to the next data block (i.e. molecule, we hope )
513    while (lexer.next_token(token) && token.type != CIFLexer::KeyDataToken);
514    if (token.type == CIFLexer::KeyDataToken)
515      { // we have found the next data block:
516      pmol->BeginModify();
517      pmol->SetTitle(token.as_text);
518      bool finished = false, token_peeked = false;
519      double cell_a = 1.0, cell_b = 1.0, cell_c = 1.0;
520      double cell_alpha = 90.0, cell_beta = 90.0, cell_gamma = 90.0;
521      CIFTagID::CIFDataName name_tag = CIFTagID::unread_CIFDataName;
522      CIFTagID::CIFDataName formula_tag = CIFTagID::unread_CIFDataName;
523      int use_cell = 0, use_fract = 0;
524      string space_group_name("P1");
525      SpaceGroup space_group;
526      bool space_group_failed = false;
527      std::map<string, double> atomic_charges;
528      while (!finished && (token_peeked || lexer.next_token(token)))
529        {
530        token_peeked = false;
531        switch (token.type)
532          {
533        case CIFLexer::KeyGlobalToken:
534          // We have come to the next block:
535          if (pmol->NumAtoms() > 0)
536            { // Found a molecule, so finished
537            finished = true;
538            // move back to the start of the global block:
539            lexer.backup(token.as_text.size(), 'g'); // length of "global_"
540            }
541          else // not yet found a molecule, so go to the next data block
542            {
543            while (lexer.next_token(token) && token.type != CIFLexer::KeyDataToken);
544            if (token.type == CIFLexer::KeyDataToken)
545              { // we have found the next data block:
546              pmol->SetTitle(token.as_text);
547              }
548            }
549          break;
550        case CIFLexer::KeyDataToken:
551          // We have come to the next data block:
552          if (pmol->NumAtoms() > 0)
553            { // Found a molecule, so finished
554            finished = true;
555            // move back to the start of the data block:
556            lexer.backup(5 + token.as_text.size(), 'd'); // length of "data_<name>"
557            }
558          else // not yet found a molecule, so try again
559            pmol->SetTitle(token.as_text);
560          break;
561        case CIFLexer::KeySaveToken:
562          { // Simply eat tokens until the save_ ending token
563          while (lexer.next_token(token) && token.type != CIFLexer::KeySaveEndToken);
564          }
565          break;
566        case CIFLexer::KeyLoopToken:
567          {
568          CIFColumnList  columns;
569          CIFTagID::CIFCatName catid = CIFTagID::unread_CIFCatName;
570          while ( (token_peeked = lexer.next_token(token)) == true && token.type == CIFLexer::TagToken)
571            { // Read in the tags
572            CIFTagID::CIFDataName tagid = lexer.lookup_tag(token.as_text);
573            columns.push_back(tagid);
574            if (catid == CIFTagID::unread_CIFCatName && tagid != CIFTagID::unread_CIFDataName)
575              catid = lexer.lookup_cat(tagid);
576            }
577          size_t column_count = columns.size();
578          switch (catid)
579            {
580          case CIFTagID::atom_site:
581            {
582            int use_cartn = 0, use_residue = 0;
583            use_fract = 0;
584            CIFTagID::CIFDataName atom_type_tag = CIFTagID::unread_CIFDataName;
585            for (CIFColumnList::const_iterator colx = columns.begin(), coly = columns.end(); colx != coly; ++ colx)
586              {
587              switch (* colx)
588                {
589              case CIFTagID::_atom_site_Cartn_x:
590              case CIFTagID::_atom_site_Cartn_y:
591              case CIFTagID::_atom_site_Cartn_z:
592                ++ use_cartn;
593                break;
594              case CIFTagID::_atom_site_fract_x:
595              case CIFTagID::_atom_site_fract_y:
596              case CIFTagID::_atom_site_fract_z:
597                ++ use_fract;
598                break;
599              case CIFTagID::_atom_site_label_comp_id:
600              case CIFTagID::_atom_site_label_seq_id:
601                ++ use_residue;
602                break;
603              case CIFTagID::_atom_site_type_symbol:
604              case CIFTagID::_atom_site_label_atom_id:
605              case CIFTagID::_atom_site_label:
606                if (atom_type_tag < (* colx))
607                  atom_type_tag = (* colx);
608                break;
609              default:
610                break;
611                }
612              }
613            if (use_cartn)
614              {
615              for (CIFColumnList::iterator colx = columns.begin(), coly = columns.end(); colx != coly; ++ colx)
616                if ( (* colx) >= CIFTagID::_atom_site_fract_x && (* colx) <= CIFTagID::_atom_site_fract_z)
617                  (* colx) = CIFTagID::unread_CIFDataName;
618              use_fract = 0;
619              }
620            size_t column_idx = 0;
621            OBAtom * atom = nullptr;
622            double x = 0.0, y = 0.0, z = 0.0;
623            CIFResidueMap ResidueMap;
624            unsigned long chain_num = 1, residue_num = 1;
625            unsigned int nbc=0;
626            string residue_name, atom_label, atom_mol_label, tmpSymbol;
627            int atomicNum;
628            OBPairData *label;
629            while (token.type == CIFLexer::ValueToken) // Read in the Fields
630              {
631              if (column_idx == 0)
632                {
633                atom  = pmol->NewAtom();
634                x = y = z = 0.0;
635                }
636              switch (columns[column_idx])
637                {
638              case CIFTagID::_atom_site_label: // The atomic label within the molecule
639                label = new OBPairData;
640                label->SetAttribute("_atom_site_label");
641                label->SetValue(token.as_text);
642                label->SetOrigin(fileformatInput);
643                atom->SetData(label);
644                atom_mol_label.assign(token.as_text);
645 
646                if (atom_type_tag != CIFTagID::_atom_site_label)
647                  break;
648                // Else remove everything starting from the first digit
649                // and drop through to _atom_site_type_symbol
650                if(string::npos != token.as_text.find_first_of("0123456789"))
651                  {token.as_text.erase(token.as_text.find_first_of("0123456789"), token.as_text.size());}
652              case CIFTagID::_atom_site_type_symbol:
653                // Problem: posat->mSymbol is not guaranteed to actually be a
654                // symbol see http://www.iucr.org/iucr-top/cif/cifdic_html/1/cif_core.dic/Iatom_type_symbol.html
655                // Try to strip the string to have a better chance to have a
656                // valid symbol
657                // This is not guaranteed to work still, as the CIF standard
658                // allows about any string...
659                tmpSymbol=token.as_text.c_str();
660                if ((tmpSymbol.size()==1) && isalpha(tmpSymbol[0]))
661                  {
662                  nbc=1;
663                  }
664                else if (tmpSymbol.size()>=2)
665                  {
666                  if (isalpha(tmpSymbol[0]) && isalpha(tmpSymbol[1]))
667                    {
668                    nbc=2;
669                    }
670                  else if (isalpha(tmpSymbol[0]))
671                    {
672                    nbc=1;
673                    }
674                  }
675                else
676                  {
677                  nbc = 0;
678                  }
679                if (tmpSymbol.size()>nbc)
680                  {// Try to find a formal charge in the symbol
681                  int charge=0;
682                  int sign=0;
683                  for(unsigned int i=nbc;i<tmpSymbol.size();++i)
684                    {// Use first number found as formal charge
685                    if (isdigit(tmpSymbol[i]) && (charge==0))
686                      {
687                      charge=atoi(tmpSymbol.substr(i,1).c_str());
688                      }
689                    if ('-'==tmpSymbol[i])
690                      {
691                      sign-=1;
692                      }
693                    if ('+'==tmpSymbol[i])
694                      {
695                      sign+=1;
696                      }
697                    }
698                    if (0!=sign) // no sign, no charge
699                      {
700                      if (charge==0)
701                        {
702                        charge=1;
703                        }
704                      stringstream ss;
705                      ss<< tmpSymbol <<" / symbol="<<tmpSymbol.substr(0,nbc)
706                        <<" charge= "<<sign*charge;
707                      obErrorLog.ThrowError(__FUNCTION__, ss.str(), obDebug);
708                      atom->SetFormalCharge(sign*charge);
709                      }
710                  }
711                if (nbc>0)
712                  {
713                  tmpSymbol=tmpSymbol.substr(0,nbc);
714                  }
715                else
716                  {
717                  stringstream ss;
718                  ss<< tmpSymbol <<" / could not derive a symbol"
719                    <<" for atomic number. Setting it to default "
720                    <<" Xx(atomic number 0)";
721                  obErrorLog.ThrowError(__FUNCTION__, ss.str(), obDebug);
722                  tmpSymbol="Xx";//Something went wrong, no symbol ! Default to Xx
723                  }
724                atomicNum = OBElements::GetAtomicNum(tmpSymbol.c_str());
725                // Test for some oxygens with subscripts
726                if (atomicNum == 0 && tmpSymbol[0] == 'O')
727                  {
728                  atomicNum = 8; // e.g. Ob, OH, etc.
729                  }
730 
731                atom->SetAtomicNum(atomicNum); //set atomic number, or '0' if the atom type is not recognized
732                atom->SetType(tmpSymbol);
733                break;
734              case CIFTagID::_atom_site_fract_x:
735              case CIFTagID::_atom_site_Cartn_x:
736                x = token.as_number();
737                break;
738              case CIFTagID::_atom_site_fract_y:
739              case CIFTagID::_atom_site_Cartn_y:
740                y = token.as_number();
741                break;
742              case CIFTagID::_atom_site_fract_z:
743              case CIFTagID::_atom_site_Cartn_z:
744                z = token.as_number();
745                break;
746              case CIFTagID::_atom_site_label_atom_id: // The atomic label within the residue
747                atom_label.assign(token.as_text);
748                if (atom_type_tag == CIFTagID::_atom_site_label_atom_id)
749                  {
750                  for (string::iterator posx = token.as_text.begin(), posy = token.as_text.end(); posx != posy; ++ posx)
751                    {
752                    char c = (char)toupper(* posx);
753                    if ( c < 'A' || c > 'Z' )
754                      {
755                      token.as_text.erase(posx, posy);
756                      break;
757                      }
758                    }
759                  atom->SetAtomicNum(OBElements::GetAtomicNum(token.as_text.c_str()));
760                  atom->SetType(token.as_text);
761                  }
762                break;
763              case CIFTagID::_atom_site_label_comp_id: // The residue abbreviation, e.g. ILE
764                residue_name.assign(token.as_text);
765                break;
766              case CIFTagID::_atom_site_label_entity_id: // The chain entity number of the residue, e.g. 2
767     // ignored and replaced by unique id for label_asym_id
768                break;
769              case CIFTagID::_atom_site_label_asym_id: // The strand number of the residue
770                    if (token.as_text != last_asym_id) {
771                        CIFasymmap::const_iterator asym_it = asym_map.find(token.as_text);
772                           if (asym_it == asym_map.end()) {
773                               ++next_asym_no;
774                               asym_it =
775                                   asym_map.insert(CIFasymmap::value_type(token.as_text,
776                                                                          next_asym_no)).first;
777                           }
778                           chain_num = asym_it->second;
779                           last_asym_id = token.as_text;
780                }
781                break;
782              case CIFTagID::_atom_site_label_seq_id: // The sequence number of the residue, within the chain, e.g. 12
783                residue_num = token.as_unsigned();
784                break;
785              case CIFTagID::_atom_site_occupancy: // The occupancy of the site.
786                {
787                  OBPairFloatingPoint * occup = new OBPairFloatingPoint;
788                  occup->SetAttribute("_atom_site_occupancy");
789                  double occupancy = std::max(0.0, std::min(1.0, token.as_number())); // clamp occupancy to [0.0, 1.0] bugfix
790                  occup->SetValue(occupancy);
791                  occup->SetOrigin(fileformatInput);
792                  atom->SetData(occup);
793                }
794                break;
795              case CIFTagID::unread_CIFDataName:
796              default:
797                break;
798                }
799              ++ column_idx;
800              if (column_idx == column_count)
801                {
802                atom->SetVector(x, y, z);
803                if (use_residue == 2)
804                  {
805                  has_residue_information = true;
806                  CIFResidueID res_id(chain_num, residue_num);
807                  CIFResidueMap::const_iterator resx = ResidueMap.find(res_id);
808                  OBResidue * res;
809                  if (resx == ResidueMap.end())
810                    {
811                    ResidueMap[res_id] = pmol->NumResidues();
812                    res  = pmol->NewResidue();
813                    res->SetChainNum(chain_num);
814                    res->SetNum(residue_num);
815                    res->SetName(residue_name);
816                    }
817                  else
818                    res = pmol->GetResidue( (* resx).second );
819                  res->AddAtom(atom);
820                  if (!atom_label.empty())
821                    res->SetAtomID(atom, atom_label);
822                  unsigned long serial_no = strtoul(atom_mol_label.c_str(), nullptr, 10);
823                  if (serial_no > 0)
824                    res->SetSerialNum(atom, serial_no);
825                  }
826                column_idx = 0;
827                }
828              token_peeked = lexer.next_token(token);
829              }
830            }
831            break;
832          case CIFTagID::symmetry_equiv:
833            {
834            size_t column_idx = 0;
835            while (token.type == CIFLexer::ValueToken) // Read in the Fields
836              {
837              if ((columns[column_idx] == CIFTagID::_symmetry_equiv_pos_as_xyz)
838                && token.as_text.find(UNKNOWN_VALUE) == string::npos)
839                space_group.AddTransform(token.as_text);
840              ++ column_idx;
841              if (column_idx == column_count)
842                column_idx = 0;
843              token_peeked = lexer.next_token(token);
844              }
845            }
846            break;
847 
848          case CIFTagID::atom_type: //Atoms oxidations
849            {
850            size_t column_idx = 0;
851            string atom_label = "";
852            double charge = 0;
853            while (token.type == CIFLexer::ValueToken) // Read in the Fields
854              {
855              if (columns[column_idx] == CIFTagID::_atom_type_symbol)
856                atom_label = token.as_text;
857              if (columns[column_idx] == CIFTagID::_atom_type_oxidation_number)
858                charge = token.as_number();
859              ++ column_idx;
860              if (column_idx == column_count)
861              {
862                atomic_charges[atom_label] = charge;
863                column_idx = 0;
864              }
865              token_peeked = lexer.next_token(token);
866              }
867            }
868            break;
869 
870 
871          case CIFTagID::unread_CIFCatName:
872          default:
873            while (token.type == CIFLexer::ValueToken) // Eat the values, we don't want them
874              token_peeked = lexer.next_token(token);
875            break;
876            }
877          }
878          break;
879        case CIFLexer::TagToken:
880          {
881          CIFTagID::CIFDataName tag_id = lexer.lookup_tag(token.as_text);
882          // get the value
883          lexer.next_token(token);
884          switch (tag_id)
885            {
886          case CIFTagID::_cell_length_a:
887            cell_a = token.as_number();
888            ++ use_cell;
889            break;
890          case CIFTagID::_cell_length_b:
891            cell_b = token.as_number();
892            ++ use_cell;
893            break;
894          case CIFTagID::_cell_length_c:
895            cell_c = token.as_number();
896            ++ use_cell;
897            break;
898          case CIFTagID::_cell_angle_alpha:
899            cell_alpha = token.as_number();
900            ++ use_cell;
901            break;
902          case CIFTagID::_cell_angle_beta:
903            cell_beta = token.as_number();
904            ++ use_cell;
905            break;
906          case CIFTagID::_cell_angle_gamma:
907            cell_gamma = token.as_number();
908            ++ use_cell;
909            break;
910          case CIFTagID::_chemical_name_systematic:
911          case CIFTagID::_chemical_name_mineral:
912          case CIFTagID::_chemical_name_structure_type:
913          case CIFTagID::_chemical_name_common:
914            if (tag_id > name_tag)
915              {
916              name_tag = tag_id;
917              pmol->SetTitle(token.as_text);
918              }
919            break;
920          case CIFTagID::_chemical_formula_analytical:
921          case CIFTagID::_chemical_formula_structural:
922          case CIFTagID::_chemical_formula_iupac:
923          case CIFTagID::_chemical_formula_moiety:
924            if (tag_id > formula_tag)
925              {
926              formula_tag = tag_id;
927              pmol->SetFormula(token.as_text);
928              }
929            break;
930          case CIFTagID::_space_group_IT_number:
931          case CIFTagID::_symmetry_Int_Tables_number:
932            space_group_name.assign(token.as_text);
933            space_group.SetId(atoi(space_group_name.c_str()));
934            break;
935          case CIFTagID::_space_group_name_Hall:
936          case CIFTagID::_symmetry_space_group_name_Hall:
937            space_group_name.assign(token.as_text);
938            space_group.SetHallName(space_group_name.c_str());
939            break;
940          case CIFTagID::_space_group_name_H_M_alt:
941          case CIFTagID::_symmetry_space_group_name_H_M:
942            space_group_name.assign(token.as_text);
943            space_group.SetHMName(space_group_name.c_str());
944            break;
945          case CIFTagID::_symmetry_equiv_pos_as_xyz:
946            space_group.AddTransform(token.as_text);
947            break;
948          default: // eat the value for this tag
949            break;
950            }
951          }
952          break;
953        case CIFLexer::KeyStopToken:
954        case CIFLexer::ValueToken:
955        default:
956          break;
957          }
958        }
959      if (pmol->NumAtoms() > 0)
960        {
961        if (use_cell >= 6)
962          {
963          OBUnitCell * pCell = new OBUnitCell;  // No matching "delete" because it's saved in pmol->SetData
964          pCell->SetOrigin(fileformatInput);
965          pCell->SetData(cell_a, cell_b, cell_c,
966                         cell_alpha,
967                         cell_beta,
968                         cell_gamma
969                         );
970          pCell->SetSpaceGroup(space_group_name);
971          const SpaceGroup * pSpaceGroup = SpaceGroup::Find( & space_group);
972          if (pSpaceGroup)
973            pCell->SetSpaceGroup(pSpaceGroup);
974          else
975            space_group_failed = true;
976          pmol->SetData(pCell);
977          if (use_fract)
978            {
979            for (OBAtomIterator atom_x = pmol->BeginAtoms(), atom_y = pmol->EndAtoms(); atom_x != atom_y; ++ atom_x)
980              {
981              OBAtom * atom = (* atom_x);
982              if (wrap_coords)
983                atom->SetVector(pCell->FractionalToCartesian(
984                                pCell->WrapFractionalCoordinate(atom->GetVector())));
985              else
986                atom->SetVector(pCell->FractionalToCartesian(atom->GetVector()));
987              }  // Note: this is where we could keep the original fractional coordinates, e.g. in a new OBCoord class
988            }
989          if (pConv->IsOption("p",OBConversion::INOPTIONS))
990            pmol->SetPeriodicMol();
991          }
992        for (OBAtomIterator atom_x = pmol->BeginAtoms(), atom_y = pmol->EndAtoms(); atom_x != atom_y; ++atom_x )
993        {
994          OBAtom * atom = (* atom_x);
995          OBPairData * pd = dynamic_cast<OBPairData *>( atom->GetData( "_atom_site_label" ) );
996          if (pd != nullptr)
997          {
998            if( atomic_charges.count( pd->GetValue() ) > 0 )
999            {
1000                OBPairFloatingPoint * charge_obd = new OBPairFloatingPoint;
1001                charge_obd->SetAttribute("input_charge");
1002                charge_obd->SetValue(atomic_charges[pd->GetValue()] );
1003                charge_obd->SetOrigin(fileformatInput);
1004                atom->SetData(charge_obd);
1005            }
1006          }
1007        }
1008 
1009        if (!pConv->IsOption("b",OBConversion::INOPTIONS))
1010          {
1011          pmol->ConnectTheDots();
1012          if (!pConv->IsOption("s",OBConversion::INOPTIONS))
1013            pmol->PerceiveBondOrders();
1014          }
1015        }
1016 
1017        if (space_group_failed)
1018        {
1019          string transformations;
1020          transform3dIterator ti;
1021          const transform3d *t = space_group.BeginTransform(ti);
1022          while(t){
1023            transformations += t->DescribeAsString() + " ";
1024            t = space_group.NextTransform(ti);
1025          }
1026 
1027          OBOp* pOp = OBOp::FindType("fillUC");
1028          if (pOp && transformations.length())
1029          {
1030            map<string, string> m;
1031            m.insert(pair<string, string>("transformations", transformations));
1032            pOp->Do(pmol, "strict", &m);
1033          }
1034        }
1035 
1036      pmol->EndModify();
1037      }
1038    if (has_residue_information)
1039      pmol->SetChainsPerceived();
1040    return (pmol->NumAtoms() > 0 ? true : false);
1041  }
1042 
1043  ////////////////////////////////////////////////////////////////
1044 
WriteMolecule(OBBase * pOb,OBConversion * pConv)1045  bool mmCIFFormat::WriteMolecule(OBBase* pOb, OBConversion* pConv)
1046  {
1047    OBMol* pmol = dynamic_cast<OBMol*>(pOb);
1048    if (pmol == nullptr)
1049      return false;
1050 
1051    //Define some references so we can use the old parameter names
1052    ostream & ofs = * pConv->GetOutStream();
1053 
1054    char buffer[BUFF_SIZE];
1055 
1056    string id;
1057    for (const char * p = pmol->GetTitle(); * p; ++ p)
1058      if ( (* p) > ' ' && (* p) <= '~' )
1059        id.append(1, (char)toupper(* p));
1060    if (id.empty())
1061      {
1062      snprintf(buffer, BUFF_SIZE, "T%lu", (unsigned long)time(nullptr));
1063      id.assign(buffer);
1064      }
1065    ofs << "# --------------------------------------------------------------------------" << endl;
1066    ofs << "#" << endl;
1067    ofs << "# CIF file generated by openbabel " << BABEL_VERSION << " http://openbabel.org/" << endl;
1068    ofs << "# to comply with the Macromolecular CIF Dictionary  (cif_mm.dic) version  2.0.11 http://mmcif.pdb.org/" << endl;
1069    ofs << "# The contents of this file were derived from " << pConv->GetInFilename() << endl;
1070    ofs << "#" << endl;
1071    ofs << "#---------------------------------------------------------------------------" << endl;
1072    ofs << endl;
1073    ofs << "data_" << id << endl;
1074    ofs << endl;
1075    ofs << "###########" << endl;
1076    ofs << "## ENTRY ##" << endl;
1077    ofs << "###########" << endl;
1078    ofs << endl;
1079    ofs << "_entry.id\t" << id << endl;
1080    ofs << endl;
1081    if (* (pmol->GetTitle()))
1082      {
1083      ofs << "##############" << endl;
1084      ofs << "## CHEMICAL ##" << endl;
1085      ofs << "##############" << endl;
1086      ofs << endl;
1087      ofs << "_chemical.entry_id\t" << id << endl;
1088      ofs << "_chemical.name_common\t'" << pmol->GetTitle() << "'" << endl;
1089      ofs << endl;
1090      }
1091    if (! (pmol->GetSpacedFormula().empty()))
1092      {
1093      ofs << "######################" << endl;
1094      ofs << "## CHEMICAL FORMULA ##" << endl;
1095      ofs << "######################" << endl;
1096      ofs << endl;
1097      ofs << "_chemical_formula.entry_id\t" << id << endl;
1098      ofs << "_chemical_formula.structural\t'" << pmol->GetFormula() << "'" << endl;
1099      ofs << endl;
1100      }
1101    ofs << "###############" << endl;
1102    ofs << "## ATOM_SITE ##" << endl;
1103    ofs << "###############" << endl;
1104    ofs << endl;
1105    ofs << "loop_" << endl;
1106    ofs << "_atom_site.id" << endl;
1107    ofs << "_atom_site.type_symbol" << endl;
1108    bool has_residues = (pmol->NumResidues() > 0);
1109    if (has_residues)
1110      {
1111      ofs << "_atom_site.label_atom_id" << endl;
1112      ofs << "_atom_site.label_comp_id" << endl;
1113      ofs << "_atom_site.label_entity_id" << endl;
1114      ofs << "_atom_site.label_seq_id" << endl;
1115      }
1116    ofs << "_atom_site.Cartn_x" << endl;
1117    ofs << "_atom_site.Cartn_y" << endl;
1118    ofs << "_atom_site.Cartn_z" << endl;
1119    size_t site_id = 1;
1120    for (OBAtomIterator atom_x = pmol->BeginAtoms(), atom_y = pmol->EndAtoms(); atom_x != atom_y; ++ atom_x, ++ site_id)
1121      {
1122      OBAtom * atom = (* atom_x);
1123      ofs << '\t' << site_id << '\t' << OBElements::GetSymbol(atom->GetAtomicNum());
1124      if (has_residues)
1125        {
1126        OBResidue * pRes = atom->GetResidue();
1127        string resname(pRes->GetName()), atomname(pRes->GetAtomID(atom));
1128        if (atomname.empty())
1129          {
1130          snprintf(buffer, BUFF_SIZE, "%s%lu", OBElements::GetSymbol(atom->GetAtomicNum()), (unsigned long)site_id);
1131          atomname.assign(buffer);
1132          }
1133        if (resname.empty())
1134          resname.assign("UNK");
1135        ofs << '\t' << atomname << '\t' << resname << '\t' << pRes->GetChainNum() << '\t' << pRes->GetNum() << endl;
1136        }
1137      ofs << '\t' << atom->GetX() << '\t' << atom->GetY() << '\t' << atom->GetZ() << endl;
1138      }
1139    ofs << endl;
1140    if (pmol->HasData(OBGenericDataType::UnitCell))
1141      {
1142      OBUnitCell * pCell = (OBUnitCell * )pmol->GetData(OBGenericDataType::UnitCell);
1143      ofs << "##########" << endl;
1144      ofs << "## CELL ##" << endl;
1145      ofs << "##########" << endl;
1146      ofs << endl;
1147      ofs << "_cell.entry_id\t" << id << endl;
1148      ofs << "_cell.length_a\t" << pCell->GetA() << endl;
1149      ofs << "_cell.length_b\t" << pCell->GetB() << endl;
1150      ofs << "_cell.length_c\t" << pCell->GetC() << endl;
1151      ofs << "_cell.angle_alpha\t" << pCell->GetAlpha() << endl;
1152      ofs << "_cell.angle_beta\t"  << pCell->GetBeta() << endl;
1153      ofs << "_cell.angle_gamma\t" << pCell->GetGamma() << endl;
1154      ofs << endl;
1155      const SpaceGroup * pSG = pCell->GetSpaceGroup();
1156      if (pSG)
1157        {
1158        ofs << "#################" << endl;
1159        ofs << "## SPACE GROUP ##" << endl;
1160        ofs << "#################" << endl;
1161        ofs << endl;
1162        ofs << "_space_group.id\t" << id << endl;
1163        bool loop_transforms = false;
1164        if (pSG->GetId())
1165          {
1166          ofs << "_space_group.IT_number\t" << pSG->GetId() << endl;
1167          }
1168        if (! (pSG->GetHallName().empty()))
1169          {
1170          loop_transforms = true;
1171          ofs << "_space_group.name_Hall\t'" << pSG->GetHallName() << "'" << endl;
1172          }
1173        if (! (pSG->GetHMName().empty()))
1174          {
1175          loop_transforms = true;
1176          // Do we have an extended HM symbol, with origin choice as ":1" or ":2" ? If so, remove it.
1177          size_t n=pSG->GetHMName().find(":");
1178          if(n==string::npos)
1179            ofs << "_space_group_name_H-M_alt '" << pSG->GetHMName() << "'" << endl;
1180          else
1181            ofs << "_space_group_name_H-M_alt '" << pSG->GetHMName().substr(0,n) << "'" << endl;
1182          }
1183        ofs << endl;
1184 
1185        transform3dIterator transx;
1186        const transform3d * trans = pSG->BeginTransform(transx);
1187        if (trans)
1188          {
1189          ofs << "####################" << endl;
1190          ofs << "## SYMMETRY EQUIV ##" << endl;
1191          ofs << "####################" << endl;
1192          ofs << endl;
1193          size_t symid = 1;
1194          if (!loop_transforms)
1195            {
1196            ofs << "_symmetry_equiv.id\t" << symid << endl;
1197            ofs << "_symmetry_equiv.pos_as_xyz\t'" << trans->DescribeAsString() << "'" << endl;
1198            }
1199          else
1200            {
1201            ofs << "loop_" << endl;
1202            ofs << "_symmetry_equiv.id" << endl;
1203            ofs << "_symmetry_equiv.pos_as_xyz" << endl;
1204            while (trans)
1205              {
1206              ofs << '\t' << symid << "\t'" << trans->DescribeAsString() << "'" << endl;
1207              trans = pSG->NextTransform(transx);
1208              }
1209            }
1210          ofs << endl;
1211          }
1212        }
1213      }
1214 
1215    return true;
1216  }
1217 
1218 } //namespace OpenBabel
1219