/*
   smilesto.c

   converts SMILES strings to connection tables

   Simon Kilvington, University of Southampton, 1995
*/
8 
#include "bbltyp.h"

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
10 
/* true if C can start an atom symbol: a letter or the '*' wildcard.
   C is converted to unsigned char before isalpha(), which is undefined for negative char values.
   C is evaluated twice, so it must not have side effects. */
#define isatomchr(C)		(isalpha((unsigned char) (C)) || (C) == '*')
12 
13 /*
14    smilestocontab
15    uses a finite state machine to convert a SMILES string to a connection table
16    returns NULL if any trouble (invalid SMILES or no memory) or a ptr to the alloc'd fragment
17    only the first molecule specified in the string will be constructed, terminators are fullstop, space, newline, and end of string
18    differences between the SMILES used by this and "real" SMILES strings are:
19    1. charges, isotope numbers, and attached hydrogens can not be specified inside an atoms brackets, eg [NH4+]
20    2. as coords are not generated symmetry symbols (ie @, @@, /, and \) can not be used
21 !!!!!!!!!!!! change it so these are just ignored rather than causing parsing errors !!!!!!!!!!!!!!!!!
22 */
23 
24 #define MAXSMILESRINGS 100	/* max ring closure number +1 */
25 
26 smilescontab_t *
smilestocontab(char * smiles)27 smilestocontab(char *smiles)
28 {
29    int i, b, rno, loss, lastatom, maxbranches, bstackptr, *bstack = NULL, ringatom[MAXSMILESRINGS];
30    char atname[2], *bord = NULL;
31    smilescontab_t *frag = NULL;
32    smilesatom_t *aptr = NULL;
33    smilesbond_t nextbond;
34    int toggle;
35 
36    static const char bondchr[] = "~-=#:";
37 
38 /* the states */
39    enum state_typ { St_Start, St_Name1, St_OpenSquare, St_Name2, St_CloseSquare, St_Number, St_CloseBranch, St_OpenBranch, St_Bond, St_End, St_Error } state;
40 
41 /* the tokens */
42    enum {Tok_Symbol, Tok_Bond, Tok_Number, Tok_OpenSquare, Tok_CloseSquare, Tok_OpenBranch, Tok_CloseBranch, Tok_End };
43 
44 /* the state machine */
45    static const int next[9][8] = { {St_Name1, St_Error, St_Error,  St_OpenSquare, St_Error,       St_Error,      St_Error,       St_End},
46 				   {St_Name1, St_Bond,  St_Number, St_OpenSquare, St_Error,       St_OpenBranch, St_CloseBranch, St_End},
47 				   {St_Name2, St_Error, St_Error,  St_Error,      St_Error,       St_Error,      St_Error,       St_Error},
48 				   {St_Error, St_Error, St_Error,  St_Error,      St_CloseSquare, St_Error,      St_Error,       St_Error},
49 				   {St_Name1, St_Bond,  St_Number, St_OpenSquare, St_Error,       St_OpenBranch, St_CloseBranch, St_End},
50 				   {St_Name1, St_Bond,  St_Number, St_OpenSquare, St_Error,       St_OpenBranch, St_CloseBranch, St_End},
51 				   {St_Name1, St_Bond,  St_Error,  St_OpenSquare, St_Error,       St_OpenBranch, St_CloseBranch, St_End},
52 				   {St_Name1, St_Bond,  St_Error,  St_OpenSquare, St_Error,       St_Error,      St_Error,       St_Error},
53 				   {St_Name1, St_Error, St_Error,  St_OpenSquare, St_Error,       St_Error,      St_Error,       St_Error}  };
54 
55    for(i=0; i<MAXSMILESRINGS; i++)
56       ringatom[i] = -1;
57 
58    if(!(frag = malloc(smiles_CONTABHDRSIZE)))
59    {
60       printf("No memory to convert SMILES string\n");
61       return NULL;
62    }
63    frag->natoms = 0;
64 
65    maxbranches = strutils_noccurrences(smiles, '(');
66    if (maxbranches > 0)
67      if(!(bstack = malloc(maxbranches * sizeof(int))))
68      {
69        printf("No memory to convert SMILES string\n");
70        free(frag);
71        return NULL;
72      }
73 
74    i = 0;			/* index into smiles string */
75    bstackptr = 0;		/* no of branches on the stack */
76    state = St_Start;
77    while(state != St_Error && state != St_End)
78    {
79    /* see what the next token in the string is, and move onto the appropriate state */
80       if(strchr(". \n", smiles[i]) != NULL)		state = (enum state_typ) next[state][Tok_End];
81       else if(isatomchr(smiles[i]))			state = (enum state_typ) next[state][Tok_Symbol];
82       else if(bord = strchr(bondchr, smiles[i]))	state = (enum state_typ) next[state][Tok_Bond];
83       else if(isdigit(smiles[i]) || smiles[i] == '%')	state = (enum state_typ) next[state][Tok_Number];
84       else if(smiles[i] == '[')				state = (enum state_typ) next[state][Tok_OpenSquare];
85       else if(smiles[i] == ']')				state = (enum state_typ) next[state][Tok_CloseSquare];
86       else if(smiles[i] == '(')				state = (enum state_typ) next[state][Tok_OpenBranch];
87       else if(smiles[i] == ')')				state = (enum state_typ) next[state][Tok_CloseBranch];
88       else 						state = St_Error;			/* invalid character */
89 
90    /* do we need to do anything in this state */
91       switch(state)
92       {
93       case St_Name1:
94       /* check it's not "Cl" or "Br" */
95 	 if(strncmp(&smiles[i], "Cl", 2) == 0 || strncmp(&smiles[i], "Br", 2) == 0)
96 	 {
97 	    atname[0] = smiles[i];
98 	    atname[1] = smiles[i+1];
99 	    i+=2;
100 	 }
101 	 else
102 	 {
103 	    atname[0] = smiles[i];
104             atname[1] = '\0';
105 	    i++;
106 	 }
107 	 if(!addatomtocontab(&frag, atname, &lastatom, nextbond))
108 	    state = St_Error;
109 	 nextbond = SMILESBOND_Single;
110 	 break;
111 
112       case St_Name2:
113 	 if(smiles[i+1] == ']')
114 	 {
115 	    atname[0] = smiles[i];
116 	    atname[1] = '\0';
117 	    i++;
118 	 }
119 	 else
120 	 {
121 	    atname[0] = smiles[i];
122 	    atname[1] = smiles[i+1];
123 	    i += 2;
124 	 }
125 	 if(!addatomtocontab(&frag, atname, &lastatom, nextbond))
126 	    state = St_Error;
127 	 nextbond = SMILESBOND_Single;
128 	 break;
129 
130       case St_Number:
131 	 if(smiles[i] == '%')
132 	 {
133 	    rno = (10 * (smiles[i+1] - '0')) + (smiles[i+2] - '0');
134 	    i += 3;
135 	 }
136 	 else
137 	 {
138 	    rno = smiles[i] - '0';
139 	    i++;
140 	 }
141 	 if(ringatom[rno] == -1)		/* start of ring */
142 	 {
143 	    ringatom[rno] = lastatom;
144 	 }
145 	 else					/* end of ring */
146 	 {
147 	    if(!closecontabring(frag, ringatom[rno], lastatom))
148 	       state = St_Error;
149 	    ringatom[rno] = -1;
150 	 }
151 	 break;
152 
153       case St_Bond:
154 	 nextbond = (bord - bondchr);
155 	 i++;
156 	 break;
157 
158       case St_OpenBranch:
159 	 bstack[bstackptr++] = lastatom;
160 	 i++;
161 	 break;
162 
163       case St_CloseBranch:
164 	 if(bstackptr == 0)
165 	 {
166 	    printf("Unmatched closing branch bracket in SMILES string at character %d\n", i+1);
167 	    state = St_Error;
168 	 }
169 	 else
170 	 {
171 	    lastatom = bstack[--bstackptr];
172 	    i++;
173 	 }
174 	 break;
175 
176       case St_End:
177 	 if(frag->natoms == 0)
178 	 {
179 	    printf("SMILES string contains no atoms\n");
180 	    state = St_Error;
181 	 }
182 	 break;
183 
184       case St_Error:
185          printf("Error parsing SMILES string at character %d \"%c\"\n", i+1, smiles[i]);
186 	 break;
187 
188       default:
189 	 i++;
190 	 break;
191       }
192    }
193 
194 /* check we are not in the middle of any rings or branches */
195    if(state != St_Error && bstackptr > 0)
196    {
197       printf("Unmatched brackets in SMILES string\n");
198       state = St_Error;
199    }
200 
201    for(i=0; state != St_Error && i<MAXSMILESRINGS; i++)
202    {
203       if(ringatom[i] != -1)
204       {
205 	 printf("Unmatched ring closure number (%d) in SMILES string\n", i);
206 	 state = St_Error;
207       }
208    }
209 
210    if(state == St_Error)
211    {
212    /* clean up */
213      if (frag)
214      {
215        free(frag);
216        frag = NULL;
217      }
218 
219    }
220    else
221    {
222    /* convert all the chemical symbols to the correct case */
223       for(i=0; i<frag->natoms; i++)
224       {
225 	 aptr = &frag->atom[i];
226 	 aptr->symbol[0] = toupper(aptr->symbol[0]);
227 	 aptr->symbol[1] = tolower(aptr->symbol[1]);
228       }
229    }
230 
231    if (bstack)
232    {
233      free(bstack);
234      bstack = NULL;
235    }
236 
237    return frag;
238 }
239 
240 #undef MAXSMILESRINGS
241 
242 /*
243    addatomtocontab
244    adds an extra atom to *frag and bonds it to *lastatom (unless the new atom is the first one) with the given bond type
245    if bondtype is SINGLE and both atoms have lower case names an AROMATIC bond is used instead
246    updates *lastatom to be the index of the atom just added (ie the last atom in the fragment)
247    returns FALSE if any trouble (no memory, unknown atom type, or too many bonds to *lastatom)
248 */
249 
250 int
addatomtocontab(smilescontab_t ** frag,char * atname,int * lastatom,smilesbond_t bondtype)251 addatomtocontab(smilescontab_t **frag, char *atname, int *lastatom, smilesbond_t bondtype)
252 {
253    smilescontab_t *newptr;
254    int type, newatom, b, btype, bbits;
255    smilesatom_t *aptr, *bptr;
256    char upper[2], pdbatm[8];
257    int uppercase, err;
258 
259    err = FALSE;
260 
261    uppercase = !aromaticsmilessym(atname);
262    upper[0] = toupper(atname[0]);
263    upper[1] = toupper(atname[1]);
264 
265    if(!uppercase && bondtype == SMILESBOND_Single)
266    {
267       if(aromaticsmilessym((*frag)->atom[*lastatom].symbol))
268          bondtype = SMILESBOND_Aromatic;
269    }
270 
271    newatom = (*frag)->natoms;
272 
273    if(!(newptr = realloc((*frag), smiles_CONTABHDRSIZE + ((*frag)->natoms + 1) * sizeof(smilesatom_t))))
274    {
275       printf("No memory to build fragment\n");
276       return FALSE;
277    }
278    (*frag) = newptr;
279    (*frag)->natoms ++;
280 
281 /* set up "newatom" and bond it to "*lastatom" (unless newatom is the first) */
282    aptr = &((*frag)->atom[newatom]);
283    strncpy(aptr->symbol, atname, 2);
284    if(newatom == 0)
285    {
286       for(b=0; b<md_MAXBONDS; b++)
287       {
288 	 aptr->bondedto[b] = md_NOBOND;
289 	 aptr->bondtype[b] = SMILESBOND_NoBond;
290       }
291    }
292    else
293    {
294       bptr = &((*frag)->atom[*lastatom]);
295       b = nextfreebondto(bptr);
296       if(b != -1)
297       {
298 	 bptr->bondedto[b] = newatom;
299 	 bptr->bondtype[b] = bondtype;
300 	 aptr->bondedto[0] = *lastatom;
301 	 aptr->bondtype[0] = bondtype;
302 	 for(b=1; b<md_MAXBONDS; b++)
303 	 {
304 	    aptr->bondedto[b] = md_NOBOND;
305 	    aptr->bondtype[b] = SMILESBOND_NoBond;
306 	 }
307       }
308       else
309       {
310 	 printf("Too many bonds to atom %d\n", (*lastatom)+1);
311 	 err = TRUE;
312       }
313    }
314 
315 /* set up the return values */
316    *lastatom = newatom;
317 
318    return !err;
319 }
320 
321 /*
322    closecontabring
323    bonds atom1 and atom2 using either a SINGLE or AROMATIC bond depending on whether the names of both are lower case or not
324    returns FALSE if any trouble (too many bonds to one of the atoms)
325 */
326 
327 int
closecontabring(smilescontab_t * frag,int atom1,int atom2)328 closecontabring(smilescontab_t *frag, int atom1, int atom2)
329 {
330    int bto1, bto2;
331    smilesatom_t *a1ptr, *a2ptr;
332    smilesbond_t btype;
333    int lower1, lower2;
334 
335    a1ptr = &frag->atom[atom1];
336    a2ptr = &frag->atom[atom2];
337 
338    if((bto1 = nextfreebondto(a1ptr)) == -1)
339    {
340       printf("Too many bonds to atom %d\n", atom1+1);
341       return FALSE;
342    }
343    if((bto2 = nextfreebondto(a2ptr)) == -1)
344    {
345       printf("Too many bonds to atom %d\n", atom2+1);
346       return FALSE;
347    }
348 
349    lower1 = aromaticsmilessym(a1ptr->symbol);
350    lower2 = aromaticsmilessym(a2ptr->symbol);
351 
352    btype = (lower1 && lower2) ? SMILESBOND_Aromatic : SMILESBOND_Single;
353 
354    a1ptr->bondedto[bto1] = atom2;
355    a1ptr->bondtype[bto1] = btype;
356 
357    a2ptr->bondedto[bto2] = atom1;
358    a2ptr->bondtype[bto2] = btype;
359 
360    return TRUE;
361 }
362 
/*
   aromaticsmilessym
   returns TRUE if the given SMILES atom symbol is lower case, and therefore aromatic
   only the first character is looked at, as metals may be called 'Zn' etc
   a first character that is not a letter (eg the '*' wildcard) is unchanged by tolower and so also gives TRUE
*/

int
aromaticsmilessym(char *sym)
{
/* tolower needs an unsigned char value, and the comparison must use that same value */
   int first = (unsigned char) sym[0];

   return (tolower(first) == first);
}
373 
374 /*
375    nextfreebondto
376    returns index of first bondedto[] that is md_NOBOND, or -1 if there are no free bonds
377 */
378 
379 int
nextfreebondto(smilesatom_t * aptr)380 nextfreebondto(smilesatom_t *aptr)
381 {
382    int f;
383    int freebond = FALSE;
384 
385    for(f=0; f<md_MAXBONDS && !freebond; f++)
386       freebond = (aptr->bondedto[f] == md_NOBOND);
387 
388    return (freebond) ? f-1 : -1;
389 }
390 
/*
   strutils_noccurrences
   counts the number of times the given character appears in the given string
   the terminating '\0' is never counted, so asking for '\0' gives 0
   a NULL buffer is treated as an empty string and gives 0
*/

int
strutils_noccurrences(char *buffer, char c)
{
   int n = 0;

   if(buffer == NULL)
      return 0;

   for(; *buffer != '\0'; buffer++)
   {
      if(*buffer == c)
         n++;
   }

   return n;
}
410