1 /**********************************************************************
2 Copyright (C) 2006 Chris Morley
3 
4 This file is part of the Open Babel project.
5 For more information, see <http://openbabel.org/>
6 
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation version 2 of the License.
10 
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 ***********************************************************************/
16 
#include <openbabel/babelconfig.h>
#include <cctype>
#include <fstream>
#include <string>
20 
21 // This macro is used in DLL builds. If it has not
22 // been set in babelconfig.h, define it as nothing.
23 #ifndef OBCOMMON
24   #define OBCOMMON
25 #endif
26 
27 using namespace std;
28 namespace OpenBabel
29 {
30 
/// @brief Returns true if the character is not one used in an InChI (a "NIC").
///
/// A character is treated as a NIC when it is either outside the 7-bit ASCII
/// range or one of the punctuation characters listed in the table below.
/// @param ch The character to classify.
/// @return true if @p ch cannot be part of an InChI string, false otherwise.
bool isnic(char ch)
{
  //This set of characters could be extended
  static const std::string nic("\"\'\\@<>!$%&{}[]");

  // Test the byte value as unsigned so that non-ASCII characters are detected
  // whether plain char is signed or unsigned on the target platform
  // (a plain "ch<0" is never true where char is unsigned).
  if(static_cast<unsigned char>(ch) > 0x7F)
    return true;

  return nic.find(ch)!=std::string::npos;
}
38 
/// @brief Reads an InChI (possibly split) from an input stream and returns it as unsplit text.
/// The input stream is left after the end of the extracted InChI ready to look for the next one.
/// @param is Stream to read from; characters are consumed up to and including
///           the character that terminates the InChI, or to the end of the stream.
/// @return The text accumulated for the InChI with the ignored whitespace and
///         inserted strings removed. If the stream ends before a terminator is
///         seen, whatever has been accumulated so far is returned (this may be
///         empty or an incomplete InChI).
std::string GetInChI(std::istream& is);
42 
43 /*!
44 This function recovers a normal InChI from an input stream which
45 contains other arbitrary text. The InChI string can have
46 extraneous characters inserted, for example because of word wrapping,
47 provided it follows certain rules.
48 
49 When this file (getinchi.cpp) is read, 15 InChIs will be extracted, e.g.
50  babel -iinchi getinchi.cpp -osmi
51 
52 Inside an InChI string ignore anything between < and >
53 This means that an InChI string can be split up by inserting any number of <br /> elements:
54 InChI=1/C18H25NO6S/c1-14-9-11-15(12-10-14)26(22,23)19(17(21)25-18(2,3)4)13<br />-7-6-8-16(20)24-5/h6,8-12H,7,13H2,1-5H3/b8-6-
55 
56 Any whitespace after the > is also ignored, so that newline characters can be added:
57 InChI=1/C29H33NO4Si/c1-5-32-28(31)26-25(34-27(30-26)22-15-9-6-10-16-22)<br />
58 21-33-35(29(2,3)4,23-17-11-7-12-18-23)24-19-13-8-14-20-24<br />
59 /h6-20,25-26H,5,21H2,1-4H3/t25-,26-/m0/s1
60 
61 A second consecutive <...> element ends an unquoted InChI string:
62 <p>
63 <small>InChI=1/C47H58N2O10SSi/c1-10-56-43(51)47(36(32-41(50)55-9)30-31-49(44(52)59-45<br />
64 (3,4)5)60(53,54)37-28-26-34(2)27-29-37)40<br />
65 (58-42(48-47)35-20-14-11-15-21-35)33-57-61(46(6,7)8,38-22-16-12-17-23-38)39-24-<br />
66 18-13-19-25-39/h11-29,36,40H,10,30-33H2,1-9H3<br />
67 /t36-,40-,47-/m0/s1</small>
68 </p>
69 
70   Dmitrii Tchekhovskoi made a proposal for "InChI hyphenation" or "quoted InChI".
71   http://sourceforge.net/mailarchive/forum.php?thread_id=10200459&forum_id=45166
  This proposal has not been followed up, probably because InChIKey was introduced.
73 
74   However this function GetInChI() parses quoted InChIs of this form.
75   It also extends this proposal, allowing a wider range of corrupted InChIs to be accepted.
76 
77 The original proposal was essentially:
78 - When an InChI string is enclosed by " quote characters,
79   any whitespace characters it contains (including new lines) are
80   ignored.
81 - Other extraneous strings can also be ignored, but this
82   is system dependent.
83 - The "InChI=" cannot be split.
84 
85 The extensions are:
86 - The character that encloses a quoted InChI does not have to be "
87   and can be any character that is not used in InChI - a NIC
88   [never miss the opportunity for a TLA!]. This means that
89   conflicts in systems which have other uses for the quote character
90   can be avoided.
91   As a special case, '>' is not allowed as a quote character because InChI
92   strings in HTML commonly start after <...> elements.
93 - As well as whitespace characters (which are ignored), a quoted
94   InChI can contain an extraneous string which starts and ends with
95   a NIC. This allows inserted strings like <br /> to be ignored.
96   However, only one such extraneous string is allowed.
97 - There are no restrictions on splitting "InChI=" by whitespace
98   characters, allowing a minimum column width of 1.
99   If the splitting were by an extraneous string the minimum column
100   width is 2.
101 
102 The following are some examples of split InChIs.
103 
104 First two unbroken examples, the first is unquoted
105 InChI=1/CH4/h1H4 methane
106 "InChI=1/C4H10O/c1-3-5-4-2/h3-4H2,1-2H3" diethyl ether
107 
108 Multiple white space splitting
109 @InChI=1/C15H14O3/c1-11(15(16)17)18-
110 14-10-6-5-9-13(14)12-7-3-2-4-8-12/h2
111 -11H,1H3,(H,16,17)@
112 
113 Split with extraneous text, which starts and ends with a non-InChI character
114 'InChI=1/C2H6O/c1-2-<br />3/h3H,2H2,1H3'
115 
116 Table with wrapped InChI column. (View with fixed font.)
117 
118 'InChI=1/CH4/h1H4'     !flammable!
119 'InChI=1/C2H2O4/c3-1   !toxic!
120 (4)2(5)6/h(H,3,4)(H,
121 5,6)'
122 'InChI=1/CH4O/c1-2/h   !flammable! !toxic!
123 2H,1H3'
124 'InChI=1/H2O/h1H2'
125 'InChI=1/C10H5ClN2/c   !no information!
126 11-10-4-2-1-3-9(10)5
127 -8(6-12)7-13/h1-5H'
128 
129 Quoted text in emails (but InChI is preserved after one break only).
130 > "InChI=1/C4H7N3OS/c1-7(8)4-9-5-2-3-6-9/h
131 > 2-4,8H,1H3/p+1/fC4H8N3OS/h5H/q+1/t9?"
132 >> "InChI=1/C4H7N3OS/c1-7(8)4-9-5-2-3-6-9/
133 >> h2-4,8H,1H3/p+1/fC4H8N3OS/h5H/q+1/t9?"
134 
135 Column width can be 1 if there is no extraneous text other than whitespace.
136 (When there is an extraneous string with NICs the minimum column width is 2).
137 '
138 I
139 n
140 C
141 h
142 I
143 =
144 1
145 /
146 C
147 l
148 H
149 /
150 h
151 1
152 H
153 '
154 */
155 
GetInChI(istream & is)156 string GetInChI(istream& is)
157 {
158   string prefix("InChI=");
159   string result;
160   enum statetype {before_inchi, match_inchi, unquoted, quoted};
161   statetype state = before_inchi;
162   char ch, lastch=0, qch=0;
163   size_t split_pos = 0;
164   bool inelement=false, afterelement=false;
165 
166   while((ch=is.get())!=EOF)
167   {
168     if(state==before_inchi)
169     {
170       if(ch>=0 && !isspace(ch))
171       {
172         if(ch==prefix[0])
173         {
174           result += ch;
175           state = match_inchi;
176           qch = lastch;
177         }
178       }
179       lastch = ch;
180     }
181 
182     else if(ch=='<')
183     {
184       // Ignore the content of any <...> elements
185       // But a second consecutive  <...> element terminates an unquoted InChI
186       if(afterelement && state==unquoted)
187           return result;
188       inelement=true;
189     }
190     else if(inelement)
191     {
192       if(afterelement)
193       {
194         //Now  reading after a <...> inserted in the InChI string
195         //Neglect whitespace, but any other character reverts to normal InChI parsing
196         if(ch<0 || !isspace(ch))
197         {
198           is.unget();
199           afterelement=false;
200           inelement=false;
201         }
202       }
203       else
204       {
205         if(ch=='>')
206           afterelement=true; //look for whitespace after end of element
207       }
208     }
209 
210     else if(ch>=0 && isspace(ch))
211     {
212       if(state==unquoted)
213         return result;
214     }
215 
216     else if(isnic(ch))
217     {
218       if(ch==qch && state!=match_inchi)
219         return result;
220       if(split_pos!=0)
221         result.erase(split_pos);
222       split_pos = result.size();
223     }
224 
225     else
226     {
227       result += ch;
228       if(state==match_inchi)
229       {
230         if(prefix.compare(0,result.size(),result)==0) //true if correct
231         {
232           if(result.size()==prefix.size())
233             state = isnic(qch)&& qch!='>' ? quoted : unquoted;
234         }
235         else
236         {
237           is.unget(); //It may be the start of a real "InChI="
238           result.erase();
239           state = before_inchi;
240         }
241       }
242     }
243   }
244   return result;
245 }
246 
247 } //namespace
248