1 /**********************************************************************
2 Copyright (C) 2006 Chris Morley
3
4 This file is part of the Open Babel project.
5 For more information, see <http://openbabel.org/>
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation version 2 of the License.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15 ***********************************************************************/
16
17 #include <openbabel/babelconfig.h>
18 #include <fstream>
19 #include <string>
20
21 // This macro is used in DLL builds. If it has not
22 // been set in babelconfig.h, define it as nothing.
23 #ifndef OBCOMMON
24 #define OBCOMMON
25 #endif
26
27 using namespace std;
28 namespace OpenBabel
29 {
30
/// Returns true if @p ch is a NIC, i.e. a character that is not used in an InChI.
///
/// The punctuation characters listed in @c nic below, and every byte outside
/// the 7-bit ASCII range, are classed as NICs.
/// @param ch Character to classify.
/// @return true if @p ch is a NIC, false otherwise.
bool isnic(char ch)
{
  //This set of characters could be extended
  static const std::string nic("\"\'\\@<>!$%&{}[]");

  // Whether plain char is signed is implementation-defined, so the high bit
  // is tested through unsigned char. A plain `ch<0` is never true where char
  // is unsigned, which would let non-ASCII bytes through on those platforms.
  const bool non_ascii = static_cast<unsigned char>(ch) > 0x7F;
  return non_ascii || nic.find(ch) != std::string::npos;
}
38
/// @brief Reads an InChI (possibly split) from an input stream and returns it as unsplit text.
/// The input stream is left after the end of the extracted InChI, ready to look for the next one.
/// @param is Stream to scan; characters are consumed from it as they are examined.
/// @return The extracted InChI. If the end of the stream is reached first, whatever
///         had been accumulated up to that point is returned, which may be an empty string.
std::string GetInChI(std::istream& is);
42
43 /*!
44 This function recovers a normal InChI from an input stream which
45 contains other arbitrary text. The InChI string can have
46 extraneous characters inserted, for example because of word wrapping,
47 provided it follows certain rules.
48
49 When this file (getinchi.cpp) is read, 15 InChIs will be extracted, e.g.
50 babel -iinchi getinchi.cpp -osmi
51
52 Inside an InChI string ignore anything between < and >
53 This means that an InChI string can be split up by inserting any number of <br /> elements:
54 InChI=1/C18H25NO6S/c1-14-9-11-15(12-10-14)26(22,23)19(17(21)25-18(2,3)4)13<br />-7-6-8-16(20)24-5/h6,8-12H,7,13H2,1-5H3/b8-6-
55
56 Any whitespace after the > is also ignored, so that newline characters can be added:
57 InChI=1/C29H33NO4Si/c1-5-32-28(31)26-25(34-27(30-26)22-15-9-6-10-16-22)<br />
58 21-33-35(29(2,3)4,23-17-11-7-12-18-23)24-19-13-8-14-20-24<br />
59 /h6-20,25-26H,5,21H2,1-4H3/t25-,26-/m0/s1
60
61 A second consecutive <...> element ends an unquoted InChI string:
62 <p>
63 <small>InChI=1/C47H58N2O10SSi/c1-10-56-43(51)47(36(32-41(50)55-9)30-31-49(44(52)59-45<br />
64 (3,4)5)60(53,54)37-28-26-34(2)27-29-37)40<br />
65 (58-42(48-47)35-20-14-11-15-21-35)33-57-61(46(6,7)8,38-22-16-12-17-23-38)39-24-<br />
66 18-13-19-25-39/h11-29,36,40H,10,30-33H2,1-9H3<br />
67 /t36-,40-,47-/m0/s1</small>
68 </p>
69
70 Dmitrii Tchekhovskoi made a proposal for "InChI hyphenation" or "quoted InChI".
71 http://sourceforge.net/mailarchive/forum.php?thread_id=10200459&forum_id=45166
This proposal has not been followed up, probably because InChIKey was introduced.
73
74 However this function GetInChI() parses quoted InChIs of this form.
75 It also extends this proposal, allowing a wider range of corrupted InChIs to be accepted.
76
77 The original proposal was essentially:
78 - When an InChI string is enclosed by " quote characters,
79 any whitespace characters it contains (including new lines) are
80 ignored.
81 - Other extraneous strings can also be ignored, but this
82 is system dependent.
83 - The "InChI=" cannot be split.
84
85 The extensions are:
86 - The character that encloses a quoted InChI does not have to be "
87 and can be any character that is not used in InChI - a NIC
88 [never miss the opportunity for a TLA!]. This means that
89 conflicts in systems which have other uses for the quote character
90 can be avoided.
91 As a special case, '>' is not allowed as a quote character because InChI
92 strings in HTML commonly start after <...> elements.
93 - As well as whitespace characters (which are ignored), a quoted
94 InChI can contain an extraneous string which starts and ends with
95 a NIC. This allows inserted strings like <br /> to be ignored.
96 However, only one such extraneous string is allowed.
97 - There are no restrictions on splitting "InChI=" by whitespace
98 characters, allowing a minimum column width of 1.
99 If the splitting were by an extraneous string the minimum column
100 width is 2.
101
102 The following are some examples of split InChIs.
103
104 First two unbroken examples, the first is unquoted
105 InChI=1/CH4/h1H4 methane
106 "InChI=1/C4H10O/c1-3-5-4-2/h3-4H2,1-2H3" diethyl ether
107
108 Multiple white space splitting
109 @InChI=1/C15H14O3/c1-11(15(16)17)18-
110 14-10-6-5-9-13(14)12-7-3-2-4-8-12/h2
111 -11H,1H3,(H,16,17)@
112
113 Split with extraneous text, which starts and ends with a non-InChI character
114 'InChI=1/C2H6O/c1-2-<br />3/h3H,2H2,1H3'
115
116 Table with wrapped InChI column. (View with fixed font.)
117
118 'InChI=1/CH4/h1H4' !flammable!
119 'InChI=1/C2H2O4/c3-1 !toxic!
120 (4)2(5)6/h(H,3,4)(H,
121 5,6)'
122 'InChI=1/CH4O/c1-2/h !flammable! !toxic!
123 2H,1H3'
124 'InChI=1/H2O/h1H2'
125 'InChI=1/C10H5ClN2/c !no information!
126 11-10-4-2-1-3-9(10)5
127 -8(6-12)7-13/h1-5H'
128
129 Quoted text in emails (but InChI is preserved after one break only).
130 > "InChI=1/C4H7N3OS/c1-7(8)4-9-5-2-3-6-9/h
131 > 2-4,8H,1H3/p+1/fC4H8N3OS/h5H/q+1/t9?"
132 >> "InChI=1/C4H7N3OS/c1-7(8)4-9-5-2-3-6-9/
133 >> h2-4,8H,1H3/p+1/fC4H8N3OS/h5H/q+1/t9?"
134
135 Column width can be 1 if there is no extraneous text other than whitespace.
136 (When there is an extraneous string with NICs the minimum column width is 2).
137 '
138 I
139 n
140 C
141 h
142 I
143 =
144 1
145 /
146 C
147 l
148 H
149 /
150 h
151 1
152 H
153 '
154 */
155
GetInChI(istream & is)156 string GetInChI(istream& is)
157 {
158 string prefix("InChI=");
159 string result;
160 enum statetype {before_inchi, match_inchi, unquoted, quoted};
161 statetype state = before_inchi;
162 char ch, lastch=0, qch=0;
163 size_t split_pos = 0;
164 bool inelement=false, afterelement=false;
165
166 while((ch=is.get())!=EOF)
167 {
168 if(state==before_inchi)
169 {
170 if(ch>=0 && !isspace(ch))
171 {
172 if(ch==prefix[0])
173 {
174 result += ch;
175 state = match_inchi;
176 qch = lastch;
177 }
178 }
179 lastch = ch;
180 }
181
182 else if(ch=='<')
183 {
184 // Ignore the content of any <...> elements
185 // But a second consecutive <...> element terminates an unquoted InChI
186 if(afterelement && state==unquoted)
187 return result;
188 inelement=true;
189 }
190 else if(inelement)
191 {
192 if(afterelement)
193 {
194 //Now reading after a <...> inserted in the InChI string
195 //Neglect whitespace, but any other character reverts to normal InChI parsing
196 if(ch<0 || !isspace(ch))
197 {
198 is.unget();
199 afterelement=false;
200 inelement=false;
201 }
202 }
203 else
204 {
205 if(ch=='>')
206 afterelement=true; //look for whitespace after end of element
207 }
208 }
209
210 else if(ch>=0 && isspace(ch))
211 {
212 if(state==unquoted)
213 return result;
214 }
215
216 else if(isnic(ch))
217 {
218 if(ch==qch && state!=match_inchi)
219 return result;
220 if(split_pos!=0)
221 result.erase(split_pos);
222 split_pos = result.size();
223 }
224
225 else
226 {
227 result += ch;
228 if(state==match_inchi)
229 {
230 if(prefix.compare(0,result.size(),result)==0) //true if correct
231 {
232 if(result.size()==prefix.size())
233 state = isnic(qch)&& qch!='>' ? quoted : unquoted;
234 }
235 else
236 {
237 is.unget(); //It may be the start of a real "InChI="
238 result.erase();
239 state = before_inchi;
240 }
241 }
242 }
243 }
244 return result;
245 }
246
247 } //namespace
248