1 /*
2 smilesto.c
3
4 converts SMILES strings to connection tables
5
6 Simon Kilvington, University of Southampton, 1995
7 */
8
9 #include "bbltyp.h"
10
/*
    isatomchr
    non-zero if C can start an atom symbol, ie it is a letter or the '*' wildcard
    C is evaluated twice, so do not pass an expression with side effects
    the cast is needed because isalpha is only defined for values representable as unsigned char (and EOF)
*/
#define isatomchr(C) (isalpha((unsigned char) (C)) || (C) == '*')
12
13 /*
14 smilestocontab
15 uses a finite state machine to convert a SMILES string to a connection table
16 returns NULL if any trouble (invalid SMILES or no memory) or a ptr to the alloc'd fragment
17 only the first molecule specified in the string will be constructed, terminators are fullstop, space, newline, and end of string
18 differences between the SMILES used by this and "real" SMILES strings are:
19 1. charges, isotope numbers, and attached hydrogens can not be specified inside an atoms brackets, eg [NH4+]
20 2. as coords are not generated symmetry symbols (ie @, @@, /, and \) can not be used
21 !!!!!!!!!!!! change it so these are just ignored rather than causing parsing errors !!!!!!!!!!!!!!!!!
22 */
23
24 #define MAXSMILESRINGS 100 /* max ring closure number +1 */
25
26 smilescontab_t *
smilestocontab(char * smiles)27 smilestocontab(char *smiles)
28 {
29 int i, b, rno, loss, lastatom, maxbranches, bstackptr, *bstack = NULL, ringatom[MAXSMILESRINGS];
30 char atname[2], *bord = NULL;
31 smilescontab_t *frag = NULL;
32 smilesatom_t *aptr = NULL;
33 smilesbond_t nextbond;
34 int toggle;
35
36 static const char bondchr[] = "~-=#:";
37
38 /* the states */
39 enum state_typ { St_Start, St_Name1, St_OpenSquare, St_Name2, St_CloseSquare, St_Number, St_CloseBranch, St_OpenBranch, St_Bond, St_End, St_Error } state;
40
41 /* the tokens */
42 enum {Tok_Symbol, Tok_Bond, Tok_Number, Tok_OpenSquare, Tok_CloseSquare, Tok_OpenBranch, Tok_CloseBranch, Tok_End };
43
44 /* the state machine */
45 static const int next[9][8] = { {St_Name1, St_Error, St_Error, St_OpenSquare, St_Error, St_Error, St_Error, St_End},
46 {St_Name1, St_Bond, St_Number, St_OpenSquare, St_Error, St_OpenBranch, St_CloseBranch, St_End},
47 {St_Name2, St_Error, St_Error, St_Error, St_Error, St_Error, St_Error, St_Error},
48 {St_Error, St_Error, St_Error, St_Error, St_CloseSquare, St_Error, St_Error, St_Error},
49 {St_Name1, St_Bond, St_Number, St_OpenSquare, St_Error, St_OpenBranch, St_CloseBranch, St_End},
50 {St_Name1, St_Bond, St_Number, St_OpenSquare, St_Error, St_OpenBranch, St_CloseBranch, St_End},
51 {St_Name1, St_Bond, St_Error, St_OpenSquare, St_Error, St_OpenBranch, St_CloseBranch, St_End},
52 {St_Name1, St_Bond, St_Error, St_OpenSquare, St_Error, St_Error, St_Error, St_Error},
53 {St_Name1, St_Error, St_Error, St_OpenSquare, St_Error, St_Error, St_Error, St_Error} };
54
55 for(i=0; i<MAXSMILESRINGS; i++)
56 ringatom[i] = -1;
57
58 if(!(frag = malloc(smiles_CONTABHDRSIZE)))
59 {
60 printf("No memory to convert SMILES string\n");
61 return NULL;
62 }
63 frag->natoms = 0;
64
65 maxbranches = strutils_noccurrences(smiles, '(');
66 if (maxbranches > 0)
67 if(!(bstack = malloc(maxbranches * sizeof(int))))
68 {
69 printf("No memory to convert SMILES string\n");
70 free(frag);
71 return NULL;
72 }
73
74 i = 0; /* index into smiles string */
75 bstackptr = 0; /* no of branches on the stack */
76 state = St_Start;
77 while(state != St_Error && state != St_End)
78 {
79 /* see what the next token in the string is, and move onto the appropriate state */
80 if(strchr(". \n", smiles[i]) != NULL) state = (enum state_typ) next[state][Tok_End];
81 else if(isatomchr(smiles[i])) state = (enum state_typ) next[state][Tok_Symbol];
82 else if(bord = strchr(bondchr, smiles[i])) state = (enum state_typ) next[state][Tok_Bond];
83 else if(isdigit(smiles[i]) || smiles[i] == '%') state = (enum state_typ) next[state][Tok_Number];
84 else if(smiles[i] == '[') state = (enum state_typ) next[state][Tok_OpenSquare];
85 else if(smiles[i] == ']') state = (enum state_typ) next[state][Tok_CloseSquare];
86 else if(smiles[i] == '(') state = (enum state_typ) next[state][Tok_OpenBranch];
87 else if(smiles[i] == ')') state = (enum state_typ) next[state][Tok_CloseBranch];
88 else state = St_Error; /* invalid character */
89
90 /* do we need to do anything in this state */
91 switch(state)
92 {
93 case St_Name1:
94 /* check it's not "Cl" or "Br" */
95 if(strncmp(&smiles[i], "Cl", 2) == 0 || strncmp(&smiles[i], "Br", 2) == 0)
96 {
97 atname[0] = smiles[i];
98 atname[1] = smiles[i+1];
99 i+=2;
100 }
101 else
102 {
103 atname[0] = smiles[i];
104 atname[1] = '\0';
105 i++;
106 }
107 if(!addatomtocontab(&frag, atname, &lastatom, nextbond))
108 state = St_Error;
109 nextbond = SMILESBOND_Single;
110 break;
111
112 case St_Name2:
113 if(smiles[i+1] == ']')
114 {
115 atname[0] = smiles[i];
116 atname[1] = '\0';
117 i++;
118 }
119 else
120 {
121 atname[0] = smiles[i];
122 atname[1] = smiles[i+1];
123 i += 2;
124 }
125 if(!addatomtocontab(&frag, atname, &lastatom, nextbond))
126 state = St_Error;
127 nextbond = SMILESBOND_Single;
128 break;
129
130 case St_Number:
131 if(smiles[i] == '%')
132 {
133 rno = (10 * (smiles[i+1] - '0')) + (smiles[i+2] - '0');
134 i += 3;
135 }
136 else
137 {
138 rno = smiles[i] - '0';
139 i++;
140 }
141 if(ringatom[rno] == -1) /* start of ring */
142 {
143 ringatom[rno] = lastatom;
144 }
145 else /* end of ring */
146 {
147 if(!closecontabring(frag, ringatom[rno], lastatom))
148 state = St_Error;
149 ringatom[rno] = -1;
150 }
151 break;
152
153 case St_Bond:
154 nextbond = (bord - bondchr);
155 i++;
156 break;
157
158 case St_OpenBranch:
159 bstack[bstackptr++] = lastatom;
160 i++;
161 break;
162
163 case St_CloseBranch:
164 if(bstackptr == 0)
165 {
166 printf("Unmatched closing branch bracket in SMILES string at character %d\n", i+1);
167 state = St_Error;
168 }
169 else
170 {
171 lastatom = bstack[--bstackptr];
172 i++;
173 }
174 break;
175
176 case St_End:
177 if(frag->natoms == 0)
178 {
179 printf("SMILES string contains no atoms\n");
180 state = St_Error;
181 }
182 break;
183
184 case St_Error:
185 printf("Error parsing SMILES string at character %d \"%c\"\n", i+1, smiles[i]);
186 break;
187
188 default:
189 i++;
190 break;
191 }
192 }
193
194 /* check we are not in the middle of any rings or branches */
195 if(state != St_Error && bstackptr > 0)
196 {
197 printf("Unmatched brackets in SMILES string\n");
198 state = St_Error;
199 }
200
201 for(i=0; state != St_Error && i<MAXSMILESRINGS; i++)
202 {
203 if(ringatom[i] != -1)
204 {
205 printf("Unmatched ring closure number (%d) in SMILES string\n", i);
206 state = St_Error;
207 }
208 }
209
210 if(state == St_Error)
211 {
212 /* clean up */
213 if (frag)
214 {
215 free(frag);
216 frag = NULL;
217 }
218
219 }
220 else
221 {
222 /* convert all the chemical symbols to the correct case */
223 for(i=0; i<frag->natoms; i++)
224 {
225 aptr = &frag->atom[i];
226 aptr->symbol[0] = toupper(aptr->symbol[0]);
227 aptr->symbol[1] = tolower(aptr->symbol[1]);
228 }
229 }
230
231 if (bstack)
232 {
233 free(bstack);
234 bstack = NULL;
235 }
236
237 return frag;
238 }
239
240 #undef MAXSMILESRINGS
241
242 /*
243 addatomtocontab
244 adds an extra atom to *frag and bonds it to *lastatom (unless the new atom is the first one) with the given bond type
245 if bondtype is SINGLE and both atoms have lower case names an AROMATIC bond is used instead
246 updates *lastatom to be the index of the atom just added (ie the last atom in the fragment)
247 returns FALSE if any trouble (no memory, unknown atom type, or too many bonds to *lastatom)
248 */
249
250 int
addatomtocontab(smilescontab_t ** frag,char * atname,int * lastatom,smilesbond_t bondtype)251 addatomtocontab(smilescontab_t **frag, char *atname, int *lastatom, smilesbond_t bondtype)
252 {
253 smilescontab_t *newptr;
254 int type, newatom, b, btype, bbits;
255 smilesatom_t *aptr, *bptr;
256 char upper[2], pdbatm[8];
257 int uppercase, err;
258
259 err = FALSE;
260
261 uppercase = !aromaticsmilessym(atname);
262 upper[0] = toupper(atname[0]);
263 upper[1] = toupper(atname[1]);
264
265 if(!uppercase && bondtype == SMILESBOND_Single)
266 {
267 if(aromaticsmilessym((*frag)->atom[*lastatom].symbol))
268 bondtype = SMILESBOND_Aromatic;
269 }
270
271 newatom = (*frag)->natoms;
272
273 if(!(newptr = realloc((*frag), smiles_CONTABHDRSIZE + ((*frag)->natoms + 1) * sizeof(smilesatom_t))))
274 {
275 printf("No memory to build fragment\n");
276 return FALSE;
277 }
278 (*frag) = newptr;
279 (*frag)->natoms ++;
280
281 /* set up "newatom" and bond it to "*lastatom" (unless newatom is the first) */
282 aptr = &((*frag)->atom[newatom]);
283 strncpy(aptr->symbol, atname, 2);
284 if(newatom == 0)
285 {
286 for(b=0; b<md_MAXBONDS; b++)
287 {
288 aptr->bondedto[b] = md_NOBOND;
289 aptr->bondtype[b] = SMILESBOND_NoBond;
290 }
291 }
292 else
293 {
294 bptr = &((*frag)->atom[*lastatom]);
295 b = nextfreebondto(bptr);
296 if(b != -1)
297 {
298 bptr->bondedto[b] = newatom;
299 bptr->bondtype[b] = bondtype;
300 aptr->bondedto[0] = *lastatom;
301 aptr->bondtype[0] = bondtype;
302 for(b=1; b<md_MAXBONDS; b++)
303 {
304 aptr->bondedto[b] = md_NOBOND;
305 aptr->bondtype[b] = SMILESBOND_NoBond;
306 }
307 }
308 else
309 {
310 printf("Too many bonds to atom %d\n", (*lastatom)+1);
311 err = TRUE;
312 }
313 }
314
315 /* set up the return values */
316 *lastatom = newatom;
317
318 return !err;
319 }
320
321 /*
322 closecontabring
323 bonds atom1 and atom2 using either a SINGLE or AROMATIC bond depending on whether the names of both are lower case or not
324 returns FALSE if any trouble (too many bonds to one of the atoms)
325 */
326
327 int
closecontabring(smilescontab_t * frag,int atom1,int atom2)328 closecontabring(smilescontab_t *frag, int atom1, int atom2)
329 {
330 int bto1, bto2;
331 smilesatom_t *a1ptr, *a2ptr;
332 smilesbond_t btype;
333 int lower1, lower2;
334
335 a1ptr = &frag->atom[atom1];
336 a2ptr = &frag->atom[atom2];
337
338 if((bto1 = nextfreebondto(a1ptr)) == -1)
339 {
340 printf("Too many bonds to atom %d\n", atom1+1);
341 return FALSE;
342 }
343 if((bto2 = nextfreebondto(a2ptr)) == -1)
344 {
345 printf("Too many bonds to atom %d\n", atom2+1);
346 return FALSE;
347 }
348
349 lower1 = aromaticsmilessym(a1ptr->symbol);
350 lower2 = aromaticsmilessym(a2ptr->symbol);
351
352 btype = (lower1 && lower2) ? SMILESBOND_Aromatic : SMILESBOND_Single;
353
354 a1ptr->bondedto[bto1] = atom2;
355 a1ptr->bondtype[bto1] = btype;
356
357 a2ptr->bondedto[bto2] = atom1;
358 a2ptr->bondtype[bto2] = btype;
359
360 return TRUE;
361 }
362
/*
    aromaticsmilessym
    returns non-zero if the given SMILES atom symbol is aromatic, ie its first character is not changed by tolower
    only the first character is looked at as metals may be called 'Zn' etc
    note this means that symbols starting with a non-letter (eg the '*' wildcard) also count as aromatic
*/

int
aromaticsmilessym(char *sym)
{
    /* tolower is only defined for values representable as unsigned char (and EOF) */
    unsigned char first = (unsigned char) sym[0];

    return (first == tolower(first));
}
373
374 /*
375 nextfreebondto
376 returns index of first bondedto[] that is md_NOBOND, or -1 if there are no free bonds
377 */
378
379 int
nextfreebondto(smilesatom_t * aptr)380 nextfreebondto(smilesatom_t *aptr)
381 {
382 int f;
383 int freebond = FALSE;
384
385 for(f=0; f<md_MAXBONDS && !freebond; f++)
386 freebond = (aptr->bondedto[f] == md_NOBOND);
387
388 return (freebond) ? f-1 : -1;
389 }
390
/*
    strutils_noccurrences
    counts the number of times the given character appears in the given string
    the terminating '\0' is never counted, so the result is 0 if c is '\0'
    a NULL buffer is treated as an empty string
*/

int
strutils_noccurrences(char *buffer, char c)
{
    int n = 0;

    if(buffer == NULL)
        return 0;

    for(; *buffer != '\0'; buffer++)
    {
        if(*buffer == c)
            n++;
    }

    return n;
}
410