1 /**********************************************************************
2 parsmart.cpp - SMARTS parser.
3 
4 Copyright (C) 1998-2001 by OpenEye Scientific Software, Inc.
5 Some portions Copyright (C) 2001-2006 by Geoffrey R. Hutchison
6 
7 This file is part of the Open Babel project.
8 For more information, see <http://openbabel.org/>
9 
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation version 2 of the License.
13 
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 GNU General Public License for more details.
18 ***********************************************************************/
19 #include <openbabel/babelconfig.h>
20 
21 #include <ctype.h>
22 #include <iomanip>
23 #include <cstring>
24 
25 #include <openbabel/mol.h>
26 #include <openbabel/atom.h>
27 #include <openbabel/bond.h>
28 #include <openbabel/parsmart.h>
29 #include <openbabel/stereo/stereo.h>
30 #include <openbabel/stereo/tetrahedral.h>
31 
32 using namespace std;
33 
34 namespace OpenBabel
35 {
36   /*! \class OBSmartsPattern parsmart.h <openbabel/parsmart.h>
37 
38     Substructure search is an incredibly useful tool in the context of a
39     small molecule programming library. Having an efficient substructure
40     search engine reduces the amount of hard code needed for molecule
41     perception, as well as increases the flexibility of certain
42     operations. For instance, atom typing can be easily performed based on
43     hard coded rules of element type and bond orders (or
44     hybridization). Alternatively, atom typing can also be done by
45     matching a set of substructure rules read at run time. In the latter
46     case customization based on application (such as changing the pH)
47     becomes a facile operation. Fortunately for Open Babel and its users,
48     Roger Sayle donated a SMARTS parser which became the basis for SMARTS
49     matching in Open Babel.
50 
51     For more information on the SMARTS support in Open Babel, see the wiki page:
52     http://openbabel.org/wiki/SMARTS
53 
54     The SMARTS matcher, or OBSmartsPattern, is a separate object which can
55     match patterns in the OBMol class. The following code demonstrates how
56     to use the OBSmartsPattern class:
57     \code
58     OBMol mol;
59     ...
60     OBSmartsPattern sp;
61     sp.Init("CC");
62     sp.Match(mol);
63     vector<vector<int> > maplist;
64     maplist = sp.GetMapList();
65     //or maplist = sp.GetUMapList();
66     //print out the results
67     vector<vector<int> >::iterator i;
68     vector<int>::iterator j;
69     for (i = maplist.begin();i != maplist.end();++i)
70     {
71     for (j = i->begin();j != i->end();++j)
72     cout << j << ' `;
73     cout << endl;
74     }
75     \endcode
76 
77     The preceding code reads in a molecule, initializes a SMARTS pattern
78     of two single-bonded carbons, and locates all instances of the
79     pattern in the molecule. Note that calling the Match() function
80     does not return the results of the substructure match. The results
81     from a match are stored in the OBSmartsPattern, and a call to
82     GetMapList() or GetUMapList() must be made to extract the
83     results. The function GetMapList() returns all matches of a
84     particular pattern while GetUMapList() returns only the unique
85     matches. For instance, the pattern [OD1]~C~[OD1] describes a
86     carboxylate group. This pattern will match both atom number
87     permutations of the carboxylate, and if GetMapList() is called, both
88     matches will be returned. If GetUMapList() is called only unique
89     matches of the pattern will be returned. A unique match is defined as
90     one which does not cover the identical atoms that a previous match
91     has covered.
92 
93   */
94 
95 #define ATOMPOOL      1
96 #define BONDPOOL      1
97 
98 #define AE_ANDHI        1
99 #define AE_ANDLO        2
100 #define AE_OR           3
101 #define AE_RECUR        4
102 #define AE_NOT          5
103 #define AE_TRUE         6
104 #define AE_FALSE        7
105 #define AE_AROMATIC     8
106 #define AE_ALIPHATIC    9
107 #define AE_CYCLIC       10
108 #define AE_ACYCLIC      11
109 #define AE_MASS         12
110 #define AE_ELEM         13
111 #define AE_AROMELEM     14
112 #define AE_ALIPHELEM    15
113 #define AE_HCOUNT       16
114 #define AE_CHARGE       17
115 #define AE_CONNECT      18
116 #define AE_DEGREE       19
117 #define AE_IMPLICIT     20
118 #define AE_RINGS        21
119 #define AE_SIZE         22
120 #define AE_VALENCE      23
121 #define AE_CHIRAL       24
122 #define AE_HYB          25
123 #define AE_RINGCONNECT  26
124 
125 #define AL_CLOCKWISE      1
126 #define AL_ANTICLOCKWISE  2
127 #define AL_UNSPECIFIED    0
128 
129 /* Each BondExpr node is identified by an integer type field
130    selected from the list of BE_ codes below.  BE_ANDHI,
131    BE_ANDLO and BE_OR are binary nodes of type BondExpr.bin,
132    BE_NOT is unary node of type BondExpr.mon, and the remaining
133    code are all leaf nodes.  */
134 
135 #define BE_ANDHI        1
136 #define BE_ANDLO        2
137 #define BE_OR           3
138 #define BE_NOT          4
139 #define BE_ANY          5
140 #define BE_DEFAULT      6
141 #define BE_SINGLE       7
142 #define BE_DOUBLE       8
143 #define BE_TRIPLE       9
144 #define BE_QUAD         10
145 #define BE_AROM         11
146 #define BE_RING         12
147 #define BE_UP           13
148 #define BE_DOWN         14
149 #define BE_UPUNSPEC     15
150 #define BE_DOWNUNSPEC   16
151 
152 
153   static int CreateAtom(Pattern*,AtomExpr*,int,int vb=0);
154 
155   const int SmartsImplicitRef = -9999; // Used as a placeholder when recording atom nbrs for chiral atoms
156 
157 
158   /*=============================*/
159   /*  Standard Utility Routines  */
160   /*=============================*/
161 
FatalAllocationError(const char * ptr)162   static void FatalAllocationError( const char *ptr )
163   {
164     stringstream errorMsg;
165     errorMsg << "Error: Unable to allocate" << ptr << endl;
166     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
167   }
168 
169   /*================================*/
170   /*  Atom Expression Manipulation  */
171   /*================================*/
172 
173   static void FreePattern( Pattern* );
174   static Pattern *CopyPattern( Pattern* );
175 
CopyAtomExpr(AtomExpr * expr)176   static AtomExpr *CopyAtomExpr( AtomExpr *expr )
177   {
178     AtomExpr *result;
179 
180     result = new AtomExpr;
181     result->type = expr->type;
182     switch( expr->type )
183       {
184       case AE_ANDHI:
185       case AE_ANDLO:
186       case AE_OR:
187         result->bin.lft = CopyAtomExpr(expr->bin.lft);
188         result->bin.rgt = CopyAtomExpr(expr->bin.rgt);
189         break;
190 
191       case AE_NOT:
192         result->mon.arg = CopyAtomExpr(expr->mon.arg);
193         break;
194 
195       case AE_RECUR:
196         result->recur.recur = CopyPattern((Pattern*)expr->recur.recur);
197         break;
198 
199       default:
200         result->leaf.value = expr->leaf.value;
201         break;
202       }
203     return result;
204   }
205 
FreeAtomExpr(AtomExpr * expr)206   static void FreeAtomExpr( AtomExpr *expr )
207   {
208     if( expr )
209       {
210         switch( expr->type )
211           {
212           case AE_ANDHI:
213           case AE_ANDLO:
214           case AE_OR:
215             FreeAtomExpr(expr->bin.lft);
216             FreeAtomExpr(expr->bin.rgt);
217             break;
218 
219           case AE_NOT:
220             FreeAtomExpr(expr->mon.arg);
221             break;
222 
223           case AE_RECUR:
224             FreePattern((Pattern*)expr->recur.recur);
225             break;
226           }
227 
228         delete expr;
229       }
230   }
231 
BuildAtomPred(int type)232   static AtomExpr *BuildAtomPred( int type )
233   {
234     AtomExpr *result;
235 
236     result = new AtomExpr;
237     result->leaf.type = type;
238     result->leaf.value = 0;
239     return result;
240   }
241 
BuildAtomLeaf(int type,int val)242   static AtomExpr *BuildAtomLeaf( int type, int val )
243   {
244     AtomExpr *result;
245 
246     result = new AtomExpr;
247     result->leaf.type = type;
248     result->leaf.value = val;
249     return result;
250   }
251 
BuildAtomNot(AtomExpr * expr)252   static AtomExpr *BuildAtomNot( AtomExpr *expr )
253   {
254     AtomExpr *result;
255 
256     result = new AtomExpr;
257     result->mon.type = AE_NOT;
258     result->mon.arg = expr;
259     return result;
260   }
261 
BuildAtomBin(int op,AtomExpr * lft,AtomExpr * rgt)262   static AtomExpr *BuildAtomBin( int op, AtomExpr *lft, AtomExpr *rgt )
263   {
264     AtomExpr *result;
265 
266     result = new AtomExpr;
267     result->bin.type = op;
268     result->bin.lft = lft;
269     result->bin.rgt = rgt;
270     return result;
271   }
272 
BuildAtomRecurs(Pattern * pat)273   static AtomExpr *BuildAtomRecurs( Pattern *pat )
274   {
275     AtomExpr *result;
276 
277     result = new AtomExpr;
278     result->recur.type = AE_RECUR;
279     result->recur.recur = (void*)pat;
280     return result;
281   }
282 
GenerateElement(int elem)283   static AtomExpr *GenerateElement( int elem )
284   {
285     return BuildAtomLeaf(AE_ELEM,elem);
286   }
287 
GenerateAromElem(int elem,int flag)288   static AtomExpr *GenerateAromElem( int elem, int flag )
289   {
290     return flag ? BuildAtomLeaf(AE_AROMELEM,elem)
291                 : BuildAtomLeaf(AE_ALIPHELEM,elem);
292   }
293 
294   /*================================*/
295   /*  Bond Expression Manipulation  */
296   /*================================*/
297 
CopyBondExpr(BondExpr * expr)298   static BondExpr *CopyBondExpr( BondExpr *expr )
299   {
300     BondExpr *result;
301 
302     result = new BondExpr;
303     result->type = expr->type;
304     switch( expr->type )
305       {
306       case BE_ANDHI:
307       case BE_ANDLO:
308       case BE_OR:
309         result->bin.lft = CopyBondExpr(expr->bin.lft);
310         result->bin.rgt = CopyBondExpr(expr->bin.rgt);
311         break;
312 
313       case BE_NOT:
314         result->mon.arg = CopyBondExpr(expr->mon.arg);
315         break;
316       }
317     return result;
318   }
319 
320   /**
321    * Check if two BondExpr objects are the same. This is used for ring closures
322    * to identify invalid SMARTS like:
323    *
324    *   C-1CCCCC#1
325    *   C=1CCCCC:1
326    *
327    * However, the SMARTS below are valid and the bond expression next to the the
328    * second closure digit is used.
329    *
330    *   C1CCCCC#1
331    *   C1CCCCC=1
332    */
EquivalentBondExpr(BondExpr * expr1,BondExpr * expr2)333   static bool EquivalentBondExpr( BondExpr *expr1, BondExpr *expr2 )
334   {
335     if (expr1 == nullptr && expr2 == nullptr)
336       return true;
337     if (expr1 == nullptr && expr2 != nullptr)
338       return false;
339     if (expr1 != nullptr && expr2 == nullptr)
340       return false;
341 
342     if (expr1->type != expr2->type)
343       return false;
344 
345     switch( expr1->type )
346       {
347       case BE_ANDHI:
348       case BE_ANDLO:
349       case BE_OR:
350         return EquivalentBondExpr(expr1->bin.lft, expr2->bin.lft) &&
351                EquivalentBondExpr(expr1->bin.rgt, expr2->bin.rgt);
352 
353       case BE_NOT:
354         return EquivalentBondExpr(expr1->mon.arg, expr2->mon.arg);
355       }
356     return true;
357   }
358 
FreeBondExpr(BondExpr * expr)359   static void FreeBondExpr( BondExpr *expr )
360   {
361     if( expr )
362       {
363         switch( expr->type )
364           {
365           case BE_ANDHI:
366           case BE_ANDLO:
367           case BE_OR:
368             FreeBondExpr(expr->bin.lft);
369             FreeBondExpr(expr->bin.rgt);
370             break;
371 
372           case BE_NOT:
373             FreeBondExpr(expr->mon.arg);
374             break;
375           }
376 
377         delete expr;
378       }
379   }
380 
BuildBondLeaf(int type)381   static BondExpr *BuildBondLeaf( int type )
382   {
383     BondExpr *result;
384 
385     result = new BondExpr;
386     result->type = type;
387     return result;
388   }
389 
BuildBondNot(BondExpr * expr)390   static BondExpr *BuildBondNot( BondExpr *expr )
391   {
392     BondExpr *result;
393 
394     result = new BondExpr;
395     result->mon.type = BE_NOT;
396     result->mon.arg = expr;
397     return result;
398   }
399 
BuildBondBin(int op,BondExpr * lft,BondExpr * rgt)400   static BondExpr *BuildBondBin( int op, BondExpr *lft, BondExpr *rgt )
401   {
402     BondExpr *result;
403 
404     result = new BondExpr;
405     result->bin.type = op;
406     result->bin.lft = lft;
407     result->bin.rgt = rgt;
408     return result;
409   }
410 
GenerateDefaultBond(void)411   static BondExpr *GenerateDefaultBond( void )
412   {
413     return BuildBondLeaf(BE_DEFAULT);
414   }
415 
416   /*===============================*/
417   /*  SMARTS Pattern Manipulation  */
418   /*===============================*/
419 
AllocPattern(void)420   static Pattern *AllocPattern( void )
421   {
422     Pattern *ptr;
423 
424     ptr = new Pattern;
425     if( !ptr ) {
426       FatalAllocationError("pattern");
427       return nullptr;
428     }
429 
430     ptr->atom = nullptr;
431     ptr->aalloc = 0;
432     ptr->acount = 0;
433 
434     ptr->bond = nullptr;
435     ptr->balloc = 0;
436     ptr->bcount = 0;
437 
438     ptr->parts = 1;
439 
440     ptr->hasExplicitH=false;
441     return ptr;
442   }
443 
CreateAtom(Pattern * pat,AtomExpr * expr,int part,int vb)444   static int CreateAtom( Pattern *pat, AtomExpr *expr, int part,int vb)
445   {
446     int index,size;
447 
448     if (!pat)
449       return -1; // should never happen
450 
451     if( pat->acount == pat->aalloc )
452       {
453         pat->aalloc += ATOMPOOL;
454         size = (int)(pat->aalloc*sizeof(AtomSpec));
455         if( pat->atom )
456           {
457             AtomSpec *tmp = new AtomSpec[pat->aalloc];
458             copy(pat->atom, pat->atom + pat->aalloc - ATOMPOOL, tmp);
459             delete [] pat->atom;
460             pat->atom = tmp;
461           }
462         else
463           pat->atom = new AtomSpec[pat->aalloc];
464         if( !pat->atom )
465           FatalAllocationError("atom pool");
466       }
467 
468     index = pat->acount++;
469     pat->atom[index].part = part;
470     pat->atom[index].expr = expr;
471     pat->atom[index].vb = vb; //std::vector binding
472 
473     return index;
474   }
475 
CreateBond(Pattern * pat,BondExpr * expr,int src,int dst)476   static int CreateBond( Pattern *pat, BondExpr *expr, int src, int dst )
477   {
478     int index,size;
479 
480     if (!pat)
481       return -1; // should never happen
482 
483     if( pat->bcount == pat->balloc )
484       {
485         pat->balloc += BONDPOOL;
486         size = (int)(pat->balloc*sizeof(BondSpec));
487         if( pat->bond )
488           {
489             BondSpec *tmp = new BondSpec[pat->balloc];
490             copy(pat->bond, pat->bond + pat->balloc - BONDPOOL, tmp);
491             delete [] pat->bond;
492             pat->bond = tmp;
493           }
494         else
495           pat->bond = new BondSpec[pat->balloc];
496         if( !pat->bond )
497           FatalAllocationError("bond pool");
498       }
499 
500     index = pat->bcount++;
501     pat->bond[index].expr = expr;
502     pat->bond[index].src = src;
503     pat->bond[index].dst = dst;
504     return(index);
505   }
506 
CopyPattern(Pattern * pat)507   static Pattern *CopyPattern( Pattern *pat )
508   {
509     Pattern *result;
510     AtomExpr *aexpr;
511     BondExpr *bexpr;
512     int i;
513 
514     result = AllocPattern();
515     result->parts = pat->parts;
516     for( i=0; i<pat->acount; i++ )
517       {
518         aexpr = CopyAtomExpr(pat->atom[i].expr);
519         CreateAtom(result,aexpr,pat->atom[i].part);
520       }
521 
522     for( i=0; i<pat->bcount; i++ )
523       {
524         bexpr = CopyBondExpr(pat->bond[i].expr);
525         CreateBond(result,bexpr,pat->bond[i].src,pat->bond[i].dst);
526       }
527 
528     return result;
529   }
530 
FreePattern(Pattern * pat)531   static void FreePattern( Pattern *pat )
532   {
533     int i;
534 
535     if( pat )
536       {
537         if( pat->aalloc )
538           {
539             for( i=0; i<pat->acount; i++ )
540               FreeAtomExpr(pat->atom[i].expr);
541             if (pat->atom != nullptr) {
542               //free(pat->atom);
543               delete [] pat->atom;
544               pat->atom = nullptr;
545             }
546           }
547 
548         if( pat->balloc )
549           {
550             for( i=0; i<pat->bcount; i++ )
551               FreeBondExpr(pat->bond[i].expr);
552             if (pat->bond != nullptr) {
553               //free(pat->bond);
554               delete [] pat->bond;
555               pat->bond = nullptr;
556             }
557           }
558         delete pat;
559         pat = nullptr;
560       }
561   }
562 
563   /*=========================*/
564   /*  SMARTS Syntax Parsing  */
565   /*=========================*/
566 
567 
SMARTSError(Pattern * pat)568   Pattern *OBSmartsPattern::SMARTSError( Pattern *pat )
569   {
570     stringstream errorMsg;
571     errorMsg << "SMARTS Error:\n" << MainPtr << endl;
572     errorMsg << setw(LexPtr-MainPtr+1) << '^' << endl;
573     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError, onceOnly);
574 
575     FreePattern(pat);
576     return nullptr;
577   }
578 
ParseSimpleAtomPrimitive(void)579   AtomExpr *OBSmartsPattern::ParseSimpleAtomPrimitive( void )
580   {
581     switch( *LexPtr++ )
582       {
583       case '*':
584         return BuildAtomPred(AE_TRUE);
585       case 'A':
586         return BuildAtomPred(AE_ALIPHATIC);
587       case 'B':
588         if( *LexPtr == 'r' )
589           {
590             LexPtr++;
591             return GenerateElement(35);
592           }
593         return GenerateElement(5);
594       case 'C':
595         if( *LexPtr == 'l' )
596           {
597             LexPtr++;
598             return GenerateElement(17);
599           }
600         return GenerateAromElem(6,false);
601       case 'F':
602         return GenerateElement( 9);
603       case 'I':
604         return GenerateElement(53);
605       case 'N':
606         return GenerateAromElem(7,false);
607       case 'O':
608         return GenerateAromElem(8,false);
609       case 'P':
610         return GenerateAromElem(15, false);
611       case 'S':
612         return GenerateAromElem(16,false);
613       case 'a':
614         if( *LexPtr == 's' )
615           {
616             LexPtr++;
617             return GenerateAromElem(33, true);
618           }
619         return BuildAtomPred(AE_AROMATIC);
620       case 'c':
621         return GenerateAromElem( 6,true);
622       case 'n':
623         return GenerateAromElem( 7,true);
624       case 'o':
625         return GenerateAromElem( 8,true);
626       case 'p':
627         return GenerateAromElem(15,true);
628       case 's':
629         if( *LexPtr == 'e' )
630           {
631             LexPtr++;
632             return GenerateAromElem(34, true);
633           }
634         return GenerateAromElem(16,true);
635       }
636     LexPtr--;
637     return nullptr;
638   }
639 
ParseComplexAtomPrimitive(void)640   AtomExpr *OBSmartsPattern::ParseComplexAtomPrimitive( void )
641   {
642     Pattern *pat;
643     int index;
644 
645     switch( *LexPtr++ )
646       {
647       case('#'):
648         if( !isdigit(*LexPtr) )
649           return nullptr;
650 
651         index = 0;
652         while( isdigit(*LexPtr) )
653           index = index*10 + ((*LexPtr++)-'0');
654         if( index > 255 )
655           {
656             LexPtr--;
657             return nullptr;
658           }
659         return( GenerateElement(index) );
660 
661       case('$'):
662         if( *LexPtr != '(' )
663           return nullptr;
664         LexPtr++;
665         pat = ParseSMARTSPattern();
666 
667         if( !pat )
668           return nullptr;
669         if( *LexPtr != ')' )
670           {
671             FreePattern(pat);
672             return nullptr;
673           }
674         LexPtr++;
675         return( BuildAtomRecurs(pat) );
676 
677       case('*'):
678         return BuildAtomPred(AE_TRUE);
679 
680       case('+'):
681         if( isdigit(*LexPtr) )
682           {
683             index = 0;
684             while( isdigit(*LexPtr) )
685               index = index*10 + ((*LexPtr++)-'0');
686           }
687         else
688           {
689             index = 1;
690             while( *LexPtr == '+' )
691               {
692                 LexPtr++;
693                 index++;
694               }
695           }
696         return BuildAtomLeaf(AE_CHARGE,index);
697 
698       case('-'):
699         if( isdigit(*LexPtr) )
700           {
701             index = 0;
702             while( isdigit(*LexPtr) )
703               index = index*10 + ((*LexPtr++)-'0');
704           }
705         else
706           {
707             index = 1;
708             while( *LexPtr == '-' )
709               {
710                 LexPtr++;
711                 index++;
712               }
713           }
714         return BuildAtomLeaf(AE_CHARGE,-index);
715 
716       case '@':
717         if (*LexPtr == '?')
718           {
719             LexPtr++;
720             return BuildAtomLeaf(AE_CHIRAL,AL_UNSPECIFIED); // unspecified
721           }
722         else if (*LexPtr != '@')
723           return BuildAtomLeaf(AE_CHIRAL,AL_ANTICLOCKWISE);
724         else
725           {
726             LexPtr++;
727             return BuildAtomLeaf(AE_CHIRAL,AL_CLOCKWISE);
728           }
729 
730       case '^':
731         if (isdigit(*LexPtr))
732           {
733             index = 0;
734             while( isdigit(*LexPtr) )
735               index = index*10 + ((*LexPtr++)-'0');
736             return BuildAtomLeaf(AE_HYB,index);
737           }
738         else
739           return BuildAtomLeaf(AE_HYB,1);
740 
741       case('0'): case('1'): case('2'): case('3'): case('4'):
742       case('5'): case('6'): case('7'): case('8'): case('9'):
743         index = LexPtr[-1]-'0';
744         while( isdigit(*LexPtr) )
745           index = index*10 + ((*LexPtr++)-'0');
746         return BuildAtomLeaf(AE_MASS,index);
747 
748       case('A'):
749         switch( *LexPtr++ )
750           {
751           case('c'):  return GenerateElement(89);
752           case('g'):  return GenerateElement(47);
753           case('l'):  return GenerateElement(13);
754           case('m'):  return GenerateElement(95);
755           case('r'):  return GenerateElement(18);
756           case('s'):  return GenerateElement(33);
757           case('t'):  return GenerateElement(85);
758           case('u'):  return GenerateElement(79);
759           }
760         LexPtr--;
761         return BuildAtomPred(AE_ALIPHATIC);
762 
763       case('B'):
764         switch( *LexPtr++ )
765           {
766           case('a'):  return GenerateElement(56);
767           case('e'):  return GenerateElement( 4);
768           case('i'):  return GenerateElement(83);
769           case('k'):  return GenerateElement(97);
770           case('r'):  return GenerateElement(35);
771           }
772         LexPtr--;
773         return GenerateElement(5);
774 
775       case('C'):
776         switch( *LexPtr++ )
777           {
778           case('a'):  return GenerateElement(20);
779           case('d'):  return GenerateElement(48);
780           case('e'):  return GenerateElement(58);
781           case('f'):  return GenerateElement(98);
782           case('l'):  return GenerateElement(17);
783           case('m'):  return GenerateElement(96);
784           case('o'):  return GenerateElement(27);
785           case('r'):  return GenerateElement(24);
786           case('s'):  return GenerateElement(55);
787           case('u'):  return GenerateElement(29);
788           }
789         LexPtr--;
790         return GenerateAromElem(6,false);
791 
792       case('D'):
793         if( *LexPtr == 'y' )
794           {
795             LexPtr++;
796             return GenerateElement(66);
797           }
798         else if( isdigit(*LexPtr) )
799           {
800             index = 0;
801             while( isdigit(*LexPtr) )
802               index = index*10 + ((*LexPtr++)-'0');
803             return BuildAtomLeaf(AE_DEGREE,index);
804           }
805         return BuildAtomLeaf(AE_DEGREE,1);
806 
807       case('E'):
808         if( *LexPtr == 'r' )
809           {
810             LexPtr++;
811             return GenerateElement(68);
812           }
813         else if( *LexPtr == 's' )
814           {
815             LexPtr++;
816             return GenerateElement(99);
817           }
818         else if( *LexPtr == 'u' )
819           {
820             LexPtr++;
821             return GenerateElement(63);
822           }
823         break;
824 
825       case('F'):
826         if( *LexPtr == 'e' )
827           {
828             LexPtr++;
829             return GenerateElement(26);
830           }
831         else if( *LexPtr == 'm' )
832           {
833             LexPtr++;
834             return GenerateElement(100);
835           }
836         else if( *LexPtr == 'r' )
837           {
838             LexPtr++;
839             return GenerateElement(87);
840           }
841         return GenerateElement(9);
842 
843       case('G'):
844         if( *LexPtr == 'a' )
845           {
846             LexPtr++;
847             return( GenerateElement(31) );
848           }
849         else if( *LexPtr == 'd' )
850           {
851             LexPtr++;
852             return( GenerateElement(64) );
853           }
854         else if( *LexPtr == 'e' )
855           {
856             LexPtr++;
857             return( GenerateElement(32) );
858           }
859         break;
860 
861       case('H'):
862         if( *LexPtr == 'e' )
863           {
864             LexPtr++;
865             return( GenerateElement( 2) );
866           }
867         else if( *LexPtr == 'f' )
868           {
869             LexPtr++;
870             return( GenerateElement(72) );
871           }
872         else if( *LexPtr == 'g' )
873           {
874             LexPtr++;
875             return( GenerateElement(80) );
876           }
877         else if( *LexPtr == 'o' )
878           {
879             LexPtr++;
880             return( GenerateElement(67) );
881           }
882         else if( isdigit(*LexPtr) )
883           {
884             index = 0;
885             while( isdigit(*LexPtr) )
886               index = index*10 + ((*LexPtr++)-'0');
887             return BuildAtomLeaf(AE_HCOUNT,index);
888           }
889         return BuildAtomLeaf(AE_HCOUNT,1);
890 
891       case('I'):
892         if( *LexPtr == 'n' )
893           {
894             LexPtr++;
895             return( GenerateElement(49) );
896           }
897         else if( *LexPtr == 'r' )
898           {
899             LexPtr++;
900             return( GenerateElement(77) );
901           }
902         return( GenerateElement(53) );
903 
904       case('K'):
905         if( *LexPtr == 'r' )
906           {
907             LexPtr++;
908             return( GenerateElement(36) );
909           }
910         return( GenerateElement(19) );
911 
912       case('L'):
913         if( *LexPtr == 'a' )
914           {
915             LexPtr++;
916             return( GenerateElement( 57) );
917           }
918         else if( *LexPtr == 'i' )
919           {
920             LexPtr++;
921             return( GenerateElement(  3) );
922           }
923         else if( *LexPtr == 'r' )
924           {
925             LexPtr++;
926             return( GenerateElement(103) );
927           }
928         else if( *LexPtr == 'u' )
929           {
930             LexPtr++;
931             return( GenerateElement( 71) );
932           }
933         break;
934 
935       case('M'):
936         if( *LexPtr == 'd' )
937           {
938             LexPtr++;
939             return( GenerateElement(101) );
940           }
941         else if( *LexPtr == 'g' )
942           {
943             LexPtr++;
944             return( GenerateElement( 12) );
945           }
946         else if( *LexPtr == 'n' )
947           {
948             LexPtr++;
949             return( GenerateElement( 25) );
950           }
951         else if( *LexPtr == 'o' )
952           {
953             LexPtr++;
954             return( GenerateElement( 42) );
955           }
956         break;
957 
958       case('N'):
959         switch( *LexPtr++ )
960           {
961           case('a'):  return( GenerateElement( 11) );
962           case('b'):  return( GenerateElement( 41) );
963           case('d'):  return( GenerateElement( 60) );
964           case('e'):  return( GenerateElement( 10) );
965           case('i'):  return( GenerateElement( 28) );
966           case('o'):  return( GenerateElement(102) );
967           case('p'):  return( GenerateElement( 93) );
968           }
969         LexPtr--;
970         return( GenerateAromElem(7,false) );
971 
972       case('O'):
973         if( *LexPtr == 's' )
974           {
975             LexPtr++;
976             return( GenerateElement(76) );
977           }
978         return( GenerateAromElem(8,false) );
979 
980       case('P'):
981         switch( *LexPtr++ )
982           {
983           case('a'):  return( GenerateElement(91) );
984           case('b'):  return( GenerateElement(82) );
985           case('d'):  return( GenerateElement(46) );
986           case('m'):  return( GenerateElement(61) );
987           case('o'):  return( GenerateElement(84) );
988           case('r'):  return( GenerateElement(59) );
989           case('t'):  return( GenerateElement(78) );
990           case('u'):  return( GenerateElement(94) );
991           }
992         LexPtr--;
993         return( GenerateElement(15) );
994 
995       case('R'):
996         switch( *LexPtr++ )
997           {
998           case('a'):  return( GenerateElement(88) );
999           case('b'):  return( GenerateElement(37) );
1000           case('e'):  return( GenerateElement(75) );
1001           case('h'):  return( GenerateElement(45) );
1002           case('n'):  return( GenerateElement(86) );
1003           case('u'):  return( GenerateElement(44) );
1004           }
1005         LexPtr--;
1006         if( isdigit(*LexPtr) )
1007           {
1008             index = 0;
1009             while( isdigit(*LexPtr) )
1010               index = index*10 + ((*LexPtr++)-'0');
1011             if( index == 0 )
1012               return BuildAtomPred(AE_ACYCLIC);
1013             return BuildAtomLeaf(AE_RINGS,index);
1014           }
1015         return BuildAtomPred(AE_CYCLIC);
1016 
1017       case('S'):
1018         switch( *LexPtr++ )
1019           {
1020           case('b'):  return( GenerateElement(51) );
1021           case('c'):  return( GenerateElement(21) );
1022           case('e'):  return( GenerateElement(34) );
1023           case('i'):  return( GenerateElement(14) );
1024           case('m'):  return( GenerateElement(62) );
1025           case('n'):  return( GenerateElement(50) );
1026           case('r'):  return( GenerateElement(38) );
1027           }
1028         LexPtr--;
1029         return( GenerateAromElem(16,false) );
1030 
1031       case('T'):
1032         switch( *LexPtr++ )
1033           {
1034           case('a'):  return( GenerateElement(73) );
1035           case('b'):  return( GenerateElement(65) );
1036           case('c'):  return( GenerateElement(43) );
1037           case('e'):  return( GenerateElement(52) );
1038           case('h'):  return( GenerateElement(90) );
1039           case('i'):  return( GenerateElement(22) );
1040           case('l'):  return( GenerateElement(81) );
1041           case('m'):  return( GenerateElement(69) );
1042           }
1043         LexPtr--;
1044         break;
1045 
1046       case('U'):  return( GenerateElement(92) );
1047       case('V'):  return( GenerateElement(23) );
1048       case('W'):  return( GenerateElement(74) );
1049 
1050       case('X'):
1051         if( *LexPtr == 'e' )
1052           {
1053             LexPtr++;
1054             return( GenerateElement(54) );
1055           }
1056         else if( isdigit(*LexPtr) )
1057           {
1058             index = 0;
1059             while( isdigit(*LexPtr) )
1060               index = index*10 + ((*LexPtr++)-'0');
1061             if (index == 0) // default to 1 (if no number present)
1062               index = 1;
1063             return BuildAtomLeaf(AE_CONNECT,index);
1064           }
1065         return BuildAtomLeaf(AE_CONNECT,1);
1066 
1067       case('Y'):
1068         if( *LexPtr == 'b' )
1069           {
1070             LexPtr++;
1071             return( GenerateElement(70) );
1072           }
1073         return( GenerateElement(39) );
1074 
1075       case('Z'):
1076         if( *LexPtr == 'n' )
1077           {
1078             LexPtr++;
1079             return GenerateElement(30);
1080           }
1081         else if( *LexPtr == 'r' )
1082           {
1083             LexPtr++;
1084             return GenerateElement(40);
1085           }
1086         break;
1087 
1088       case('a'):
1089         if( *LexPtr == 's' )
1090           {
1091             LexPtr++;
1092             return GenerateAromElem(33,true);
1093           }
1094         return BuildAtomPred(AE_AROMATIC);
1095 
1096       case('c'):
1097         return GenerateAromElem(6,true);
1098 
1099       case('h'):
1100         if( isdigit(*LexPtr) )
1101           {
1102             index = 0;
1103             while( isdigit(*LexPtr) )
1104               index = index*10 + ((*LexPtr++)-'0');
1105           }
1106         else
1107           index = 1;
1108         return BuildAtomLeaf(AE_IMPLICIT,index);
1109 
1110       case('n'):  return GenerateAromElem(7,true);
1111       case('o'):  return GenerateAromElem(8,true);
1112       case('p'):  return GenerateAromElem(15,true);
1113 
1114       case('r'):
1115         if( isdigit(*LexPtr) )
1116           {
1117             index = 0;
1118             while( isdigit(*LexPtr) )
1119               index = index*10 + ((*LexPtr++)-'0');
1120             if( index == 0 )
1121               return BuildAtomPred(AE_ACYCLIC);
1122             return BuildAtomLeaf(AE_SIZE,index);
1123           }
1124         return BuildAtomPred(AE_CYCLIC);
1125 
1126       case('s'):
1127         if( *LexPtr == 'e' )
1128           {
1129             LexPtr++;
1130             return GenerateAromElem(34,true);
1131           }
1132         return GenerateAromElem(16,true);
1133 
1134       case('v'):
1135         if( isdigit(*LexPtr) )
1136           {
1137             index = 0;
1138             while( isdigit(*LexPtr) )
1139               index = index*10 + ((*LexPtr++)-'0');
1140             return BuildAtomLeaf(AE_VALENCE,index);
1141           }
1142         return BuildAtomLeaf(AE_VALENCE,1);
1143 
1144       case('x'):
1145         if( isdigit(*LexPtr) )
1146           {
1147             index = 0;
1148             while( isdigit(*LexPtr) )
1149               index = index*10 + ((*LexPtr++)-'0');
1150             return BuildAtomLeaf(AE_RINGCONNECT,index);
1151           }
1152         return BuildAtomPred(AE_CYCLIC);
1153       }
1154     LexPtr--;
1155     return nullptr;
1156   }
1157 
ParseAtomExpr(int level)1158   AtomExpr *OBSmartsPattern::ParseAtomExpr( int level )
1159   {
1160     AtomExpr *expr1 = nullptr;
1161     AtomExpr *expr2 = nullptr;
1162     char *prev;
1163 
1164     switch( level )
1165       {
1166       case(0): /* Low Precedence Conjunction */
1167         if( !(expr1=ParseAtomExpr(1)) )
1168           return nullptr;
1169 
1170         while( *LexPtr == ';' )
1171           {
1172             LexPtr++;
1173             if( !(expr2=ParseAtomExpr(1)) )
1174               {
1175                 FreeAtomExpr(expr1);
1176                 return nullptr;
1177               }
1178             expr1 = BuildAtomBin(AE_ANDLO,expr1,expr2);
1179           }
1180         return expr1;
1181 
1182       case(1): /* Disjunction */
1183         if( !(expr1=ParseAtomExpr(2)) )
1184           return nullptr;
1185 
1186         while( *LexPtr == ',' )
1187           {
1188             LexPtr++;
1189             if( !(expr2=ParseAtomExpr(2)) )
1190               {
1191                 FreeAtomExpr(expr1);
1192                 return nullptr;
1193               }
1194             expr1 = BuildAtomBin(AE_OR,expr1,expr2);
1195           }
1196         return( expr1 );
1197 
1198       case(2): /* High Precedence Conjunction */
1199         if( !(expr1=ParseAtomExpr(3)) )
1200           return nullptr;
1201 
1202         while( (*LexPtr!=']') && (*LexPtr!=';') &&
1203                (*LexPtr!=',') && *LexPtr )
1204           {
1205             if( *LexPtr=='&' )
1206               LexPtr++;
1207             prev = LexPtr;
1208             if( !(expr2=ParseAtomExpr(3)) )
1209               {
1210                 if( prev != LexPtr )
1211                   {
1212                     FreeAtomExpr(expr1);
1213                     return nullptr;
1214                   }
1215                 else
1216                   return( expr1 );
1217               }
1218             expr1 = BuildAtomBin(AE_ANDHI,expr1,expr2);
1219           }
1220         return( expr1 );
1221 
1222       case(3): /* Negation or Primitive */
1223         if( *LexPtr == '!' )
1224           {
1225             LexPtr++;
1226             if( !(expr1=ParseAtomExpr(3)) )
1227               return nullptr;
1228             return( BuildAtomNot(expr1) );
1229           }
1230         return( ParseComplexAtomPrimitive() );
1231       }
1232     return nullptr;
1233   }
1234 
ParseBondPrimitive(void)1235   BondExpr *OBSmartsPattern::ParseBondPrimitive( void )
1236   {
1237     char bsym = *LexPtr++;
1238 
1239     switch(bsym)
1240       {
1241       case '-':  return BuildBondLeaf(BE_SINGLE);
1242       case '=':  return BuildBondLeaf(BE_DOUBLE);
1243       case '#':  return BuildBondLeaf(BE_TRIPLE);
1244       case '$':  return BuildBondLeaf(BE_QUAD);
1245       case ':':  return BuildBondLeaf(BE_AROM);
1246       case '@':  return BuildBondLeaf(BE_RING);
1247       case '~':  return BuildBondLeaf(BE_ANY);
1248 
1249       // return BuildBondLeaf(*LexPtr == '?' ? BE_UPUNSPEC : BE_UP);
1250       case '/':  return BuildBondLeaf(BE_SINGLE);
1251       // return BuildBondLeaf(*LexPtr == '?' ? BE_DOWNUNSPEC : BE_DOWN);
1252       case '\\': return BuildBondLeaf(BE_SINGLE);
1253       }
1254     LexPtr--;
1255     return nullptr;
1256   }
1257 
ParseBondExpr(int level)1258   BondExpr *OBSmartsPattern::ParseBondExpr( int level )
1259   {
1260     BondExpr *expr1 = nullptr;
1261     BondExpr *expr2 = nullptr;
1262     char *prev;
1263 
1264     switch( level )
1265       {
1266       case(0): /* Low Precedence Conjunction */
1267         if( !(expr1=ParseBondExpr(1)) )
1268           return nullptr;
1269 
1270         while( *LexPtr == ';' )
1271           {
1272             LexPtr++;
1273             if( !(expr2=ParseBondExpr(1)) )
1274               {
1275                 FreeBondExpr(expr1);
1276                 return nullptr;
1277               }
1278             expr1 = BuildBondBin(BE_ANDLO,expr1,expr2);
1279           }
1280         return expr1;
1281 
1282       case(1): /* Disjunction */
1283         if( !(expr1=ParseBondExpr(2)) )
1284           return nullptr;
1285 
1286         while( *LexPtr == ',' )
1287           {
1288             LexPtr++;
1289             if( !(expr2=ParseBondExpr(2)) )
1290               {
1291                 FreeBondExpr(expr1);
1292                 return nullptr;
1293               }
1294             expr1 = BuildBondBin(BE_OR,expr1,expr2);
1295           }
1296         return expr1;
1297 
1298       case(2): /* High Precedence Conjunction */
1299         if( !(expr1=ParseBondExpr(3)) )
1300           return nullptr;
1301 
1302         while( (*LexPtr!=']') && (*LexPtr!=';') &&
1303                (*LexPtr!=',') && *LexPtr )
1304           {
1305             if( *LexPtr == '&' )
1306               LexPtr++;
1307             prev = LexPtr;
1308             if( !(expr2=ParseBondExpr(3)) )
1309               {
1310                 if( prev != LexPtr )
1311                   {
1312                     FreeBondExpr(expr1);
1313                     return nullptr;
1314                   }
1315                 else
1316                   return expr1;
1317               }
1318             expr1 = BuildBondBin(BE_ANDHI,expr1,expr2);
1319           }
1320         return expr1;
1321 
1322       case(3): /* Negation or Primitive */
1323         if( *LexPtr == '!' )
1324           {
1325             LexPtr++;
1326             if( !(expr1=ParseBondExpr(3)) )
1327               return nullptr;
1328             return BuildBondNot(expr1);
1329           }
1330         return ParseBondPrimitive();
1331       }
1332     return nullptr;
1333   }
1334 
GetVectorBinding()1335   int OBSmartsPattern::GetVectorBinding()
1336   {
1337     int vb=0;
1338 
1339     LexPtr++; //skip colon
1340     if(isdigit(*LexPtr))
1341       {
1342         vb = 0;
1343         while( isdigit(*LexPtr) )
1344           vb = vb*10 + ((*LexPtr++)-'0');
1345       }
1346 
1347     return(vb);
1348   }
1349 
ParseSMARTSError(Pattern * pat,BondExpr * expr)1350   Pattern *OBSmartsPattern::ParseSMARTSError( Pattern *pat, BondExpr *expr )
1351   {
1352     if( expr )
1353       FreeBondExpr(expr);
1354     return SMARTSError(pat);
1355   }
1356 
SMARTSParser(Pattern * pat,ParseState * stat,int prev,int part)1357   Pattern *OBSmartsPattern::SMARTSParser( Pattern *pat, ParseState *stat,
1358                                 int prev, int part )
1359   {
1360     int vb = 0;
1361     AtomExpr *aexpr;
1362     BondExpr *bexpr;
1363     int index;
1364 
1365     bexpr = nullptr;
1366 
1367     while( *LexPtr )
1368       {
1369         switch( *LexPtr++ )
1370           {
1371           case('.'):
1372             return ParseSMARTSError(pat,bexpr);
1373 
1374           case('-'):  case('='):  case('#'): case('$'):
1375           case(':'):  case('~'):  case('@'):
1376           case('/'):  case('\\'): case('!'):
1377             LexPtr--;
1378             if( (prev==-1) || bexpr )
1379               return ParseSMARTSError(pat,bexpr);
1380             if( !(bexpr=ParseBondExpr(0)) )
1381               return ParseSMARTSError(pat,bexpr);
1382             break;
1383 
1384           case('('):
1385             if( bexpr )
1386               {
1387                 LexPtr--;
1388                 return ParseSMARTSError(pat,bexpr);
1389               }
1390             if( prev == -1 )
1391               {
1392                 index = pat->acount;
1393                 pat = SMARTSParser(pat,stat,-1,part);
1394                 if( !pat )
1395                   return nullptr;
1396                 if( index == pat->acount )
1397                   return ParseSMARTSError(pat,bexpr);
1398                 prev = index;
1399               }
1400             else
1401               {
1402                 pat = SMARTSParser(pat,stat,prev,part);
1403                 if( !pat )
1404                   return nullptr;
1405               }
1406 
1407             if( *LexPtr != ')' )
1408               return ParseSMARTSError(pat,bexpr);
1409             LexPtr++;
1410             break;
1411 
1412           case(')'):  LexPtr--;
1413             if( (prev==-1) || bexpr )
1414               return ParseSMARTSError(pat,bexpr);
1415             return pat;
1416 
1417           case('%'):  if( prev == -1 )
1418               {
1419                 LexPtr--;
1420                 return ParseSMARTSError(pat,bexpr);
1421               }
1422 
1423             if( isdigit(LexPtr[0]) && isdigit(LexPtr[1]) )
1424               {
1425                 index = 10*(LexPtr[0]-'0') + (LexPtr[1]-'0');
1426                 LexPtr += 2;
1427               }
1428             else
1429               return ParseSMARTSError(pat,bexpr);
1430 
1431             if( stat->closure[index] == -1 )
1432               {
1433                 stat->closord[index] = bexpr;
1434                 stat->closure[index] = prev;
1435               }
1436             else if( stat->closure[index] != prev )
1437               {
1438                 if( !bexpr ) {
1439                   if (!stat->closord[index]) {
1440                     bexpr = GenerateDefaultBond();
1441                     FreeBondExpr(stat->closord[index]);
1442                   } else
1443                     bexpr = stat->closord[index];
1444                 } else if (stat->closord[index] && !EquivalentBondExpr(bexpr, stat->closord[index]))
1445                   return ParseSMARTSError(pat,bexpr);
1446 
1447                 CreateBond(pat,bexpr,prev,stat->closure[index]);
1448                 stat->closure[index] = -1;
1449                 bexpr = nullptr;
1450               }
1451             else
1452               return ParseSMARTSError(pat,bexpr);
1453             break;
1454 
1455           case('0'):  case('1'):  case('2'):
1456           case('3'):  case('4'):  case('5'):
1457           case('6'):  case('7'):  case('8'):
1458           case('9'):  LexPtr--;
1459             if( prev == -1 )
1460               return ParseSMARTSError(pat,bexpr);
1461             index = (*LexPtr++)-'0';
1462 
1463             if( stat->closure[index] == -1 )
1464               { // Ring opening
1465                 stat->closord[index] = bexpr;
1466                 stat->closure[index] = prev;
1467                 pat->atom[prev].nbrs.push_back(-index); // Store the BC idx as a -ve
1468                 bexpr = nullptr;
1469               }
1470             else if( stat->closure[index] != prev )
1471               { // Ring closure
1472                 if( !bexpr ) {
1473                   if (!stat->closord[index]) {
1474                     bexpr = GenerateDefaultBond();
1475                     FreeBondExpr(stat->closord[index]);
1476                   } else
1477                     bexpr = stat->closord[index];
1478                 } else if (stat->closord[index] && !EquivalentBondExpr(bexpr, stat->closord[index]))
1479                   return ParseSMARTSError(pat,bexpr);
1480 
1481                 CreateBond(pat,bexpr,prev,stat->closure[index]);
1482                 pat->atom[prev].nbrs.push_back(stat->closure[index]);
1483                 for (unsigned int nbr_idx=0; nbr_idx < pat->atom[stat->closure[index]].nbrs.size(); ++nbr_idx) {
1484                   if (pat->atom[stat->closure[index]].nbrs[nbr_idx] == -index)
1485                     pat->atom[stat->closure[index]].nbrs[nbr_idx] = prev;
1486                 }
1487                 stat->closure[index] = -1;
1488                 bexpr = nullptr;
1489               }
1490             else
1491               return ParseSMARTSError(pat,bexpr);
1492             break;
1493 
1494           case('['):
1495             // shortcut for '[H]' primitive (PR#1463791)
1496             if (*LexPtr == 'H' && *(LexPtr+1) == ']')
1497               {
1498                 aexpr = GenerateElement(1);
1499                 LexPtr++; // skip the 'H'
1500                 pat->hasExplicitH = true;
1501               }
1502             else
1503               aexpr = ParseAtomExpr(0);
1504             vb = (*LexPtr == ':') ? GetVectorBinding():0;
1505             if( !aexpr || (*LexPtr!=']') )
1506               return ParseSMARTSError(pat,bexpr);
1507             index = CreateAtom(pat,aexpr,part,vb);
1508             if( prev != -1 )
1509               {
1510                 if( !bexpr )
1511                   bexpr = GenerateDefaultBond();
1512                 CreateBond(pat,bexpr,prev,index);
1513                 pat->atom[index].nbrs.push_back(prev);
1514                 pat->atom[prev].nbrs.push_back(index);
1515                 bexpr = nullptr;
1516               }
1517             if(*(LexPtr-1) == 'H' && *(LexPtr-2) == '@' ) { // i.e. [C@H] or [C@@H]
1518               pat->atom[index].nbrs.push_back(SmartsImplicitRef);
1519             }
1520             prev = index;
1521             LexPtr++;
1522             break;
1523 
1524           default:
1525             LexPtr--;
1526             aexpr = ParseSimpleAtomPrimitive();
1527             if( !aexpr )
1528               return ParseSMARTSError(pat,bexpr);
1529             index = CreateAtom(pat,aexpr,part);
1530             if( prev != -1 )
1531               {
1532                 if( !bexpr )
1533                   bexpr = GenerateDefaultBond();
1534                 CreateBond(pat,bexpr,prev,index);
1535                 pat->atom[index].nbrs.push_back(prev);
1536                 pat->atom[prev].nbrs.push_back(index);
1537                 bexpr = nullptr;
1538               }
1539             prev = index;
1540           }
1541       }
1542 
1543     if( (prev==-1) || bexpr )
1544       return ParseSMARTSError(pat,bexpr);
1545 
1546     return pat;
1547   }
1548 
MarkGrowBonds(Pattern * pat)1549   static void MarkGrowBonds(Pattern *pat)
1550   {
1551     int i;
1552     OBBitVec bv;
1553 
1554     for (i = 0;i < pat->bcount;++i)
1555       {
1556         pat->bond[i].grow = (bv[pat->bond[i].src] && bv[pat->bond[i].dst])?
1557           false:true;
1558 
1559         bv.SetBitOn(pat->bond[i].src);
1560         bv.SetBitOn(pat->bond[i].dst);
1561       }
1562   }
1563 
GetChiralFlag(AtomExpr * expr)1564   static int GetChiralFlag(AtomExpr *expr)
1565   {
1566     int tmp1,tmp2;
1567 
1568     switch (expr->type)
1569       {
1570         case AE_CHIRAL:
1571           return expr->leaf.value;
1572 
1573         case AE_ANDHI:
1574         case AE_ANDLO:
1575           tmp1 = GetChiralFlag(expr->bin.lft);
1576           tmp2 = GetChiralFlag(expr->bin.rgt);
1577           if (tmp1 == 0) return tmp2;
1578           if (tmp2 == 0) return tmp1;
1579           if (tmp1 == tmp2) return tmp1;
1580           break;
1581 
1582         case AE_OR:
1583           tmp1 = GetChiralFlag(expr->bin.lft);
1584           tmp2 = GetChiralFlag(expr->bin.rgt);
1585           if (tmp1 == 0 || tmp2 == 0) return 0;
1586           if (tmp1 == tmp2) return tmp1;
1587           break;
1588 
1589         case AE_NOT:
1590           // Treat [!@] as [@@], and [!@@] as [@]
1591           tmp1 = GetChiralFlag(expr->mon.arg);
1592           if (tmp1 == AL_ANTICLOCKWISE) return AL_CLOCKWISE;
1593           if (tmp1 == AL_CLOCKWISE) return AL_ANTICLOCKWISE;
1594           break;
1595       }
1596 
1597     return 0;
1598   }
1599 
ParseSMARTSPart(Pattern * result,int part)1600   Pattern *OBSmartsPattern::ParseSMARTSPart( Pattern *result, int part )
1601   {
1602     ParseState stat;
1603     int i,flag;
1604 
1605     for( i=0; i<100; i++ )
1606       stat.closure[i] = -1;
1607 
1608     result = SMARTSParser(result,&stat,-1,part);
1609 
1610     flag = false;
1611     for( i=0; i<100; i++ )
1612       if( stat.closure[i] != -1 )
1613         {
1614           FreeBondExpr(stat.closord[i]);
1615           flag = true;
1616         }
1617 
1618     if( result )
1619       {
1620         if( flag )
1621           return(SMARTSError(result));
1622         else
1623           {
1624             MarkGrowBonds(result);
1625             result->ischiral = false;
1626             for (i = 0;i < result->acount;++i)
1627               {
1628                 result->atom[i].chiral_flag = GetChiralFlag(result->atom[i].expr);
1629                 if (result->atom[i].chiral_flag)
1630                   result->ischiral = true;
1631               }
1632             return(result);
1633           }
1634       }
1635     else
1636       return nullptr;
1637   }
1638 
1639 
ParseSMARTSPattern(void)1640   Pattern *OBSmartsPattern::ParseSMARTSPattern( void )
1641   {
1642     Pattern *result;
1643     result = AllocPattern();
1644 
1645     while( *LexPtr == '(' )
1646       {
1647         if (!result)
1648           return nullptr; // ensure we don't get a null dereference
1649 
1650         LexPtr++;
1651         result = ParseSMARTSPart(result,result->parts);
1652         if( !result )
1653           return nullptr;
1654         result->parts++;
1655 
1656         if( *LexPtr != ')' )
1657           return SMARTSError(result);
1658         LexPtr++;
1659 
1660         if( !*LexPtr || (*LexPtr==')') )
1661           return result;
1662 
1663         if( *LexPtr != '.' )
1664           return SMARTSError(result);
1665 
1666         // Here's where we'd handle fragments
1667         //        cerr << " conjunction " << LexPtr[0] << endl;
1668         LexPtr++;
1669       }
1670 
1671     return ParseSMARTSPart(result,0);
1672   }
1673 
ParseSMARTSString(char * ptr)1674   Pattern *OBSmartsPattern::ParseSMARTSString( char *ptr )
1675   {
1676     Pattern *result;
1677 
1678     if( !ptr || !*ptr )
1679       return nullptr;
1680 
1681     LexPtr = MainPtr = ptr;
1682     result = ParseSMARTSPattern();
1683     if( result && *LexPtr )
1684       return SMARTSError(result);
1685     return result;
1686   }
1687 
ParseSMARTSRecord(char * ptr)1688   Pattern *OBSmartsPattern::ParseSMARTSRecord( char *ptr )
1689   {
1690     char *src;
1691 
1692     src = ptr;
1693     while( *src && !isspace(*src) )
1694       src++;
1695 
1696     if( isspace(*src) )
1697       {
1698         *src++ = '\0';
1699         while( isspace(*src) )
1700           src++;
1701       }
1702 
1703     return ParseSMARTSString(ptr);
1704   }
1705 
1706   //**********************************
1707   //********Pattern Matching**********
1708   //**********************************
1709 
Init(const char * buffer)1710   bool OBSmartsPattern::Init(const char *buffer)
1711   {
1712     if (_buffer != nullptr)
1713       delete[] _buffer;
1714     _buffer = new char[strlen(buffer) + 1];
1715     strcpy(_buffer,buffer);
1716 
1717     if (_pat != nullptr)
1718       FreePattern(_pat);
1719     _pat = ParseSMARTSRecord(_buffer);
1720     _str = _buffer;
1721 
1722     return _pat != nullptr;
1723   }
1724 
Init(const std::string & s)1725   bool OBSmartsPattern::Init(const std::string &s)
1726   {
1727     if (_buffer != nullptr)
1728       delete[] _buffer;
1729     _buffer = new char[s.length() + 1];
1730     strcpy(_buffer, s.c_str());
1731 
1732     if (_pat != nullptr)
1733       FreePattern(_pat);
1734     _pat = ParseSMARTSRecord(_buffer);
1735     _str = s;
1736 
1737     return _pat != nullptr;
1738   }
1739 
~OBSmartsPattern()1740   OBSmartsPattern::~OBSmartsPattern()
1741   {
1742     if (_pat)
1743       FreePattern(_pat);
1744     if(_buffer)
1745     	delete [] _buffer;
1746   }
1747 
Match(OBMol & mol,bool single)1748   bool OBSmartsPattern::Match(OBMol &mol,bool single)
1749   {
1750 	OBSmartsMatcher matcher;
1751 	if (_pat == nullptr)
1752       return false;
1753     if(_pat->hasExplicitH) //The SMARTS pattern contains [H]
1754       {
1755         //Do matching on a copy of mol with explicit hydrogens
1756         OBMol tmol = mol;
1757         tmol.AddHydrogens(false,false);
1758         return(matcher.match(tmol,_pat,_mlist,single));
1759       }
1760     return(matcher.match(mol,_pat,_mlist,single));
1761   }
1762 
HasMatch(OBMol & mol) const1763   bool OBSmartsPattern::HasMatch(OBMol &mol) const
1764   {
1765 	  //a convenience function
1766 	  std::vector<std::vector<int> > dummy;
1767 	  return Match(mol, dummy, Single);
1768   }
1769 
Match(OBMol & mol,std::vector<std::vector<int>> & mlist,MatchType mtype) const1770   bool OBSmartsPattern::Match(OBMol &mol, std::vector<std::vector<int> > & mlist,
1771 		  MatchType mtype /*=All*/) const
1772   {
1773 	OBSmartsMatcher matcher;
1774 	mlist.clear();
1775 	if (_pat == nullptr)
1776       return false;
1777     if(_pat->hasExplicitH) //The SMARTS pattern contains [H]
1778       {
1779         //Do matching on a copy of mol with explicit hydrogens
1780         OBMol tmol = mol;
1781         tmol.AddHydrogens(false,false);
1782         if(!matcher.match(tmol,_pat,mlist,mtype == Single))
1783         	return false;
1784       }
1785     else if(!matcher.match(mol,_pat,mlist,mtype == Single))
1786     	return false;
1787 
1788     if((mtype == AllUnique) && mlist.size() > 1)
1789     {
1790     	//uniquify
1791          bool ok;
1792         OBBitVec bv;
1793         std::vector<OBBitVec> vbv;
1794         std::vector<std::vector<int> > ulist;
1795         std::vector<std::vector<int> >::iterator i;
1796         std::vector<OBBitVec>::iterator j;
1797 
1798         for (i = mlist.begin();i != mlist.end();++i)
1799           {
1800             ok = true;
1801             bv.Clear();
1802             bv.FromVecInt(*i);
1803             for (j = vbv.begin();j != vbv.end() && ok;++j)
1804               if ((*j) == bv)
1805                 ok = false;
1806 
1807             if (ok)
1808               {
1809                 ulist.push_back(*i);
1810                 vbv.push_back(bv);
1811               }
1812           }
1813 
1814         mlist = ulist;
1815     }
1816     return true;
1817   }
1818 
1819 
RestrictedMatch(OBMol & mol,std::vector<std::pair<int,int>> & pr,bool single)1820   bool OBSmartsPattern::RestrictedMatch(OBMol &mol,
1821                                         std::vector<std::pair<int,int> > &pr,
1822                                         bool single)
1823   {
1824     bool ok;
1825     std::vector<std::vector<int> > mlist;
1826     std::vector<std::vector<int> >::iterator i;
1827     std::vector<std::pair<int,int> >::iterator j;
1828 
1829     OBSmartsMatcher matcher;
1830     matcher.match(mol,_pat,mlist);
1831     _mlist.clear();
1832     if (mlist.empty())
1833       return(false);
1834 
1835     for (i = mlist.begin();i != mlist.end();++i)
1836       {
1837         ok = true;
1838         for (j = pr.begin();j != pr.end() && ok;++j)
1839           if ((*i)[j->first] != j->second)
1840             ok = false;
1841 
1842         if (ok)
1843           _mlist.push_back(*i);
1844         if (single && !_mlist.empty())
1845           return(true);
1846       }
1847 
1848     return((_mlist.empty()) ? false:true);
1849   }
1850 
RestrictedMatch(OBMol & mol,OBBitVec & vres,bool single)1851   bool OBSmartsPattern::RestrictedMatch(OBMol &mol,OBBitVec &vres, bool single)
1852   {
1853     bool ok;
1854     std::vector<int>::iterator j;
1855     std::vector<std::vector<int> > mlist;
1856     std::vector<std::vector<int> >::iterator i;
1857 
1858     OBSmartsMatcher matcher;
1859     matcher.match(mol,_pat,mlist);
1860 
1861     _mlist.clear();
1862     if (mlist.empty())
1863       return(false);
1864 
1865     for (i = mlist.begin();i != mlist.end();++i)
1866       {
1867         ok = true;
1868         for (j = i->begin();j != i->end();++j)
1869           if (!vres[*j])
1870             {
1871               ok = false;
1872               break;
1873             }
1874         if (!ok)
1875           continue;
1876 
1877         _mlist.push_back(*i);
1878         if (single && !_mlist.empty())
1879           return(true);
1880       }
1881 
1882     return((_mlist.empty()) ? false:true);
1883   }
1884 
SetupAtomMatchTable(std::vector<std::vector<bool>> & ttab,const Pattern * pat,OBMol & mol)1885   void OBSmartsMatcher::SetupAtomMatchTable(std::vector<std::vector<bool> > &ttab,
1886                            const Pattern *pat, OBMol &mol)
1887   {
1888     int i;
1889 
1890     ttab.resize(pat->acount);
1891     for (i = 0;i < pat->acount;++i)
1892       ttab[i].resize(mol.NumAtoms()+1);
1893 
1894     OBAtom *atom;
1895     std::vector<OBAtom*>::iterator j;
1896     for (i = 0;i < pat->acount;++i)
1897       for (atom = mol.BeginAtom(j);atom;atom = mol.NextAtom(j))
1898         if (EvalAtomExpr(pat->atom[0].expr,atom))
1899           ttab[i][atom->GetIdx()] = true;
1900   }
1901 
FastSingleMatch(OBMol & mol,const Pattern * pat,std::vector<std::vector<int>> & mlist)1902   void OBSmartsMatcher::FastSingleMatch(OBMol &mol, const Pattern *pat,
1903                               std::vector<std::vector<int> > &mlist)
1904   {
1905     OBAtom *atom,*a1,*nbr;
1906     std::vector<OBAtom*>::iterator i;
1907 
1908     OBBitVec bv(mol.NumAtoms()+1);
1909     std::vector<int> map;
1910     map.resize(pat->acount);
1911     std::vector<std::vector<OBBond*>::iterator> vi;
1912     std::vector<bool> vif;
1913 
1914     if (pat->bcount)
1915       {
1916         vif.resize(pat->bcount);
1917         vi.resize(pat->bcount);
1918       }
1919 
1920     int bcount;
1921     for (atom = mol.BeginAtom(i);atom;atom=mol.NextAtom(i))
1922       if (EvalAtomExpr(pat->atom[0].expr,atom))
1923         {
1924           map[0] = atom->GetIdx();
1925           if (pat->bcount)
1926             vif[0] = false;
1927           bv.Clear();
1928           bv.SetBitOn(atom->GetIdx());
1929 
1930           for (bcount=0;bcount >=0;)
1931             {
1932               //***entire pattern matched***
1933               if (bcount == pat->bcount) //save full match here
1934                 {
1935                   mlist.push_back(map);
1936                   bcount--;
1937                   return; //found a single match
1938                 }
1939 
1940               //***match the next bond***
1941               if (!pat->bond[bcount].grow) //just check bond here
1942                 {
1943                   if ( !vif[bcount] )
1944                     {
1945                       OBBond *bond = mol.GetBond(map[pat->bond[bcount].src],
1946                                                  map[pat->bond[bcount].dst]);
1947                       if (bond && EvalBondExpr(pat->bond[bcount].expr,bond))
1948                         {
1949                           vif[bcount++] = true;
1950                           if (bcount < pat->bcount)
1951                             vif[bcount] = false;
1952                         }
1953                       else
1954                         bcount--;
1955                     }
1956                   else //bond must have already been visited - backtrack
1957                     bcount--;
1958                 }
1959               else //need to map atom and check bond
1960                 {
1961                   a1 = mol.GetAtom(map[pat->bond[bcount].src]);
1962 
1963                   if (!vif[bcount]) //figure out which nbr atom we are mapping
1964                     {
1965                       nbr = a1->BeginNbrAtom(vi[bcount]);
1966                     }
1967                   else
1968                     {
1969                       bv.SetBitOff(map[pat->bond[bcount].dst]);
1970                       nbr = a1->NextNbrAtom(vi[bcount]);
1971                     }
1972 
1973                   for (;nbr;nbr=a1->NextNbrAtom(vi[bcount]))
1974                     if (!bv[nbr->GetIdx()])
1975                       if (EvalAtomExpr(pat->atom[pat->bond[bcount].dst].expr,nbr)
1976                           && EvalBondExpr(pat->bond[bcount].expr,(OBBond *)*(vi[bcount])))
1977                         {
1978                           bv.SetBitOn(nbr->GetIdx());
1979                           map[pat->bond[bcount].dst] = nbr->GetIdx();
1980                           vif[bcount] = true;
1981                           bcount++;
1982                           if (bcount < pat->bcount)
1983                             vif[bcount] = false;
1984                           break;
1985                         }
1986 
1987                   if (!nbr)//no match - time to backtrack
1988                     bcount--;
1989                 }
1990             }
1991         }
1992   }
1993 
1994 
match(OBMol & mol,const Pattern * pat,std::vector<std::vector<int>> & mlist,bool single)1995   bool OBSmartsMatcher::match(OBMol &mol, const Pattern *pat,
1996                     std::vector<std::vector<int> > &mlist,bool single)
1997   {
1998     mlist.clear();
1999     if (!pat || pat->acount == 0)
2000       return(false);//shouldn't ever happen
2001 
2002     if (single && !pat->ischiral) {
2003       // perform a fast single match (only works for non-chiral SMARTS)
2004       FastSingleMatch(mol,pat,mlist);
2005     } else {
2006       // perform normal match (chirality ignored and checked below)
2007       OBSSMatch ssm(mol,pat);
2008       ssm.Match(mlist);
2009     }
2010 
2011     if (pat->ischiral) {
2012       std::vector<std::vector<int> >::iterator m;
2013       std::vector<std::vector<int> > tmpmlist;
2014 
2015       tmpmlist.clear();
2016       // iterate over the atom mappings
2017       for (m = mlist.begin();m != mlist.end();++m) {
2018 
2019         bool allStereoCentersMatch = true;
2020 
2021         // for each pattern atom
2022         for (int j = 0; j < pat->acount; ++j) {
2023           // skip non-chiral pattern atoms
2024           if (!pat->atom[j].chiral_flag)
2025             continue;
2026           // ignore @? in smarts, parse like any other smarts
2027           if (pat->atom[j].chiral_flag == AL_UNSPECIFIED)
2028             continue;
2029 
2030           // use the mapping the get the chiral atom in the molecule being queried
2031           OBAtom *center = mol.GetAtom((*m)[j]);
2032 
2033           // get the OBTetrahedralStereo::Config from the molecule
2034           OBStereoFacade stereo(&mol);
2035           OBTetrahedralStereo *ts = stereo.GetTetrahedralStereo(center->GetId());
2036           if (!ts || !ts->GetConfig().specified) {
2037             // no stereochemistry specified in molecule for the atom
2038             // corresponding to the chiral pattern atom using the current
2039             // mapping --> no match
2040             allStereoCentersMatch = false;
2041             break;
2042           }
2043 
2044           std::vector<int> nbrs = pat->atom[j].nbrs;
2045 
2046           if (nbrs.size() != 4) { // 3 nbrs currently not supported. Other values are errors.
2047             //stringstream ss;
2048             //ss << "Ignoring stereochemistry. There are " << nbrs.size() << " connections to this atom instead of 4. Title: " << mol.GetTitle();
2049             //obErrorLog.ThrowError(__FUNCTION__, ss.str(), obWarning);
2050             continue;
2051           }
2052 
2053           // construct a OBTetrahedralStereo::Config using the smarts pattern
2054           OBTetrahedralStereo::Config smartsConfig;
2055           smartsConfig.center = center->GetId();
2056           if (nbrs.at(0) == SmartsImplicitRef)
2057             smartsConfig.from = OBStereo::ImplicitRef;
2058           else
2059             smartsConfig.from = mol.GetAtom( (*m)[nbrs.at(0)] )->GetId();
2060           OBStereo::Ref firstref;
2061           if (nbrs.at(1) == SmartsImplicitRef)
2062             firstref = OBStereo::ImplicitRef;
2063           else
2064             firstref = mol.GetAtom( (*m)[nbrs.at(1)] )->GetId();
2065           OBAtom *ra2 = mol.GetAtom( (*m)[nbrs.at(2)] );
2066           OBAtom *ra3 = mol.GetAtom( (*m)[nbrs.at(3)] );
2067           smartsConfig.refs = OBStereo::MakeRefs(firstref, ra2->GetId(), ra3->GetId());
2068 
2069           smartsConfig.view = OBStereo::ViewFrom;
2070           switch (pat->atom[j].chiral_flag) {
2071             case AL_CLOCKWISE:
2072               smartsConfig.winding = OBStereo::Clockwise;
2073               break;
2074             case AL_ANTICLOCKWISE:
2075               smartsConfig.winding = OBStereo::AntiClockwise;
2076               break;
2077             default:
2078               smartsConfig.specified = false;
2079           }
2080 
2081           // cout << "smarts config = " << smartsConfig << endl;
2082           // cout << "molecule config = " << ts->GetConfig() << endl;
2083           // cout << "match = " << (ts->GetConfig() == smartsConfig) << endl;
2084 
2085           // and save the match if the two configurations are the same
2086           if (ts->GetConfig() != smartsConfig)
2087             allStereoCentersMatch = false;
2088 
2089           // don't waste time checking more stereocenters using this mapping if one didn't match
2090           if (!allStereoCentersMatch)
2091             break;
2092         }
2093 
2094         // if all the atoms in the molecule match the stereochemistry specified
2095         // in the smarts pattern, save this mapping as a match
2096         if (allStereoCentersMatch)
2097           tmpmlist.push_back(*m);
2098       }
2099 
2100       mlist = tmpmlist;
2101     }
2102 
2103     return(!mlist.empty());
2104   }
2105 
EvalAtomExpr(AtomExpr * expr,OBAtom * atom)2106   bool OBSmartsMatcher::EvalAtomExpr(AtomExpr *expr,OBAtom *atom)
2107   {
2108     for (;;)
2109       switch (expr->type)
2110         {
2111         case AE_TRUE:
2112           return true;
2113         case AE_FALSE:
2114           return false;
2115         case AE_AROMATIC:
2116           return atom->IsAromatic();
2117         case AE_ALIPHATIC:
2118           return !atom->IsAromatic();
2119         case AE_CYCLIC:
2120           return atom->IsInRing();
2121         case AE_ACYCLIC:
2122           return !atom->IsInRing();
2123 
2124         case AE_MASS:
2125           return expr->leaf.value == atom->GetIsotope();
2126         case AE_ELEM:
2127           return expr->leaf.value == (int)atom->GetAtomicNum();
2128         case AE_AROMELEM:
2129           return expr->leaf.value == (int)atom->GetAtomicNum() &&
2130                  atom->IsAromatic();
2131         case AE_ALIPHELEM:
2132           return expr->leaf.value == (int)atom->GetAtomicNum() &&
2133                  !atom->IsAromatic();
2134         case AE_HCOUNT:
2135           return expr->leaf.value == ((int)atom->ExplicitHydrogenCount() +
2136                                       (int)atom->GetImplicitHCount());
2137         case AE_CHARGE:
2138           return expr->leaf.value == atom->GetFormalCharge();
2139         case AE_CONNECT:
2140           return expr->leaf.value == (int)atom->GetTotalDegree();
2141         case AE_DEGREE:
2142           return expr->leaf.value == (int)atom->GetExplicitDegree();
2143         case AE_IMPLICIT:
2144           return expr->leaf.value == (int)atom->GetImplicitHCount();
2145         case AE_RINGS:
2146           return expr->leaf.value == (int)atom->MemberOfRingCount();
2147         case AE_SIZE:
2148           return atom->IsInRingSize(expr->leaf.value);
2149         case AE_VALENCE:
2150           return expr->leaf.value == (int)atom->GetTotalValence();
2151         case AE_CHIRAL:
2152           // always return true (i.e. accept the match) and check later
2153           return true;
2154         case AE_HYB:
2155           return expr->leaf.value == (int)atom->GetHyb();
2156         case AE_RINGCONNECT:
2157           return expr->leaf.value == (int)atom->CountRingBonds();
2158 
2159         case AE_NOT:
2160           return !EvalAtomExpr(expr->mon.arg,atom);
2161         case AE_ANDHI: /* Same as AE_ANDLO */
2162         case AE_ANDLO:
2163           if (!EvalAtomExpr(expr->bin.lft,atom))
2164             return false;
2165           expr = expr->bin.rgt;
2166           break;
2167         case AE_OR:
2168           if (EvalAtomExpr(expr->bin.lft,atom))
2169             return true;
2170           expr = expr->bin.rgt;
2171           break;
2172 
2173         case AE_RECUR:
2174           {
2175             //see if pattern has been matched
2176             std::vector<std::pair<const Pattern*,std::vector<bool> > >::iterator i;
2177             for (i = RSCACHE.begin();i != RSCACHE.end();++i)
2178               if (i->first == (Pattern*)expr->recur.recur)
2179                 return(i->second[atom->GetIdx()]);
2180 
2181             //perceive and match pattern
2182             std::vector<std::vector<int> >::iterator j;
2183             std::vector<bool> vb(((OBMol*) atom->GetParent())->NumAtoms()+1);
2184             std::vector<std::vector<int> > mlist;
2185             if (match( *((OBMol *) atom->GetParent()),
2186                        (Pattern*)expr->recur.recur,mlist))
2187               for (j = mlist.begin();j != mlist.end();++j)
2188                 vb[(*j)[0]] = true;
2189 
2190             RSCACHE.push_back(std::pair<const Pattern*,
2191                               std::vector<bool> > ((const Pattern*)expr->recur.recur,
2192                                                    vb));
2193 
2194             return(vb[atom->GetIdx()]);
2195           }
2196 
2197         default:
2198           return false;
2199         }
2200   }
2201 
EvalBondExpr(BondExpr * expr,OBBond * bond)2202   bool OBSmartsMatcher::EvalBondExpr(BondExpr *expr,OBBond *bond)
2203   {
2204     for (;;)
2205       switch( expr->type )
2206         {
2207         case BE_ANDHI:
2208         case BE_ANDLO:
2209           if (!EvalBondExpr(expr->bin.lft,bond))
2210             return false;
2211           expr = expr->bin.rgt;
2212           break;
2213 
2214         case BE_OR:
2215           if (EvalBondExpr(expr->bin.lft,bond))
2216             return true;
2217           expr = expr->bin.rgt;
2218           break;
2219 
2220         case BE_NOT:
2221           return !EvalBondExpr(expr->mon.arg,bond);
2222 
2223         case BE_ANY:
2224           return true;
2225         case BE_DEFAULT:
2226           return bond->GetBondOrder()==1 || bond->IsAromatic();
2227         case BE_SINGLE:
2228           return bond->GetBondOrder()==1 && !bond->IsAromatic();
2229         case BE_DOUBLE:
2230           return bond->GetBondOrder()==2 && !bond->IsAromatic();
2231         case BE_TRIPLE:
2232           return bond->GetBondOrder() == 3;
2233         case BE_QUAD:
2234           return bond->GetBondOrder() == 4;
2235         case BE_AROM:
2236           return bond->IsAromatic();
2237         case BE_RING:
2238           return bond->IsInRing();
2239         //case BE_UP:
2240         //  return bond->IsUp();
2241         //case BE_DOWN:
2242         //  return bond->IsDown();
2243         //case BE_UPUNSPEC: // up or unspecified (i.e., not down)
2244         //  return !bond->IsDown();
2245         //case BE_DOWNUNSPEC: // down or unspecified (i.e., not up)
2246         //  return !bond->IsUp();
2247         default:
2248           return false;
2249         }
2250   }
2251 
GetUMapList()2252   std::vector<std::vector<int> > &OBSmartsPattern::GetUMapList()
2253   {
2254     if (_mlist.empty() || _mlist.size() == 1)
2255       return(_mlist);
2256 
2257     bool ok;
2258     OBBitVec bv;
2259     std::vector<OBBitVec> vbv;
2260     std::vector<std::vector<int> > mlist;
2261     std::vector<std::vector<int> >::iterator i;
2262     std::vector<OBBitVec>::iterator j;
2263 
2264     for (i = _mlist.begin();i != _mlist.end();++i)
2265       {
2266         ok = true;
2267         bv.Clear();
2268         bv.FromVecInt(*i);
2269         for (j = vbv.begin();j != vbv.end() && ok;++j)
2270           if ((*j) == bv)
2271             ok = false;
2272 
2273         if (ok)
2274           {
2275             mlist.push_back(*i);
2276             vbv.push_back(bv);
2277           }
2278       }
2279 
2280     _mlist = mlist;
2281     return(_mlist);
2282   }
2283 
WriteMapList(ostream & ofs)2284   void OBSmartsPattern::WriteMapList(ostream &ofs)
2285   {
2286     std::vector<std::vector<int> >::iterator i;
2287     std::vector<int>::iterator j;
2288 
2289     for ( i = _mlist.begin() ; i != _mlist.end() ; ++i )
2290       {
2291         for (j = (*i).begin();j != (*i).end();++j)
2292           ofs << *j << ' ' << ends;
2293         ofs << endl;
2294       }
2295   }
2296 
2297   //*******************************************************************
2298   //  The OBSSMatch class performs exhaustive matching using recursion
2299   //  Explicit stack handling is used to find just a single match in
2300   //  match()
2301   //*******************************************************************
2302 
OBSSMatch(OBMol & mol,const Pattern * pat)2303   OBSSMatch::OBSSMatch(OBMol &mol, const Pattern *pat)
2304   {
2305     _mol = &mol;
2306     _pat = pat;
2307     _map.resize(pat->acount);
2308 
2309     if (!mol.Empty())
2310       {
2311         _uatoms = new bool [mol.NumAtoms()+1];
2312         memset((char*)_uatoms,'\0',sizeof(bool)*(mol.NumAtoms()+1));
2313       }
2314     else
2315       _uatoms = nullptr;
2316   }
2317 
~OBSSMatch()2318   OBSSMatch::~OBSSMatch()
2319   {
2320     if (_uatoms)
2321       delete [] _uatoms;
2322   }
2323 
Match(std::vector<std::vector<int>> & mlist,int bidx)2324   void OBSSMatch::Match(std::vector<std::vector<int> > &mlist,int bidx)
2325   {
2326 	  OBSmartsMatcher matcher;
2327     if (bidx == -1)
2328       {
2329         OBAtom *atom;
2330         std::vector<OBAtom*>::iterator i;
2331         for (atom = _mol->BeginAtom(i);atom;atom = _mol->NextAtom(i))
2332           if (matcher.EvalAtomExpr(_pat->atom[0].expr,atom))
2333             {
2334               _map[0] = atom->GetIdx();
2335               _uatoms[atom->GetIdx()] = true;
2336               Match(mlist,0);
2337               _map[0] = 0;
2338               _uatoms[atom->GetIdx()] = false;
2339             }
2340         return;
2341       }
2342 
2343     if (bidx == _pat->bcount) //save full match here
2344       {
2345         mlist.push_back(_map);
2346         return;
2347       }
2348 
2349     if (_pat->bond[bidx].grow) //match the next bond
2350       {
2351         int src = _pat->bond[bidx].src;
2352         int dst = _pat->bond[bidx].dst;
2353 
2354         if (_map[src] <= 0 || _map[src] > (signed)_mol->NumAtoms())
2355           return;
2356 
2357         AtomExpr *aexpr = _pat->atom[dst].expr;
2358         BondExpr *bexpr = _pat->bond[bidx].expr;
2359         OBAtom *atom,*nbr;
2360         std::vector<OBBond*>::iterator i;
2361 
2362         atom = _mol->GetAtom(_map[src]);
2363         for (nbr = atom->BeginNbrAtom(i);nbr;nbr = atom->NextNbrAtom(i))
2364           if (!_uatoms[nbr->GetIdx()] && matcher.EvalAtomExpr(aexpr,nbr) &&
2365         		  matcher.EvalBondExpr(bexpr,((OBBond*) *i)))
2366             {
2367               _map[dst] = nbr->GetIdx();
2368               _uatoms[nbr->GetIdx()] = true;
2369               Match(mlist,bidx+1);
2370               _uatoms[nbr->GetIdx()] = false;
2371               _map[dst] = 0;
2372             }
2373       }
2374     else //just check bond here
2375       {
2376         OBBond *bond = _mol->GetBond(_map[_pat->bond[bidx].src],
2377                                      _map[_pat->bond[bidx].dst]);
2378         if (bond && matcher.EvalBondExpr(_pat->bond[bidx].expr,bond))
2379           Match(mlist,bidx+1);
2380       }
2381   }
2382 
GetExprOrder(BondExpr * expr)2383   static int GetExprOrder(BondExpr *expr)
2384   {
2385     int tmp1,tmp2;
2386 
2387     switch( expr->type )
2388       {
2389       case BE_SINGLE:  return 1;
2390       case BE_DOUBLE:  return 2;
2391       case BE_TRIPLE:  return 3;
2392       case BE_QUAD:    return 4;
2393       case BE_AROM:    return 5;
2394 
2395       case BE_UP:
2396       case BE_DOWN:
2397       case BE_UPUNSPEC:
2398       case BE_DOWNUNSPEC:
2399         return 1;
2400 
2401       case BE_ANDHI:
2402       case BE_ANDLO:
2403         tmp1 = GetExprOrder(expr->bin.lft);
2404         tmp2 = GetExprOrder(expr->bin.rgt);
2405         if (tmp1 == 0) return tmp2;
2406         if (tmp2 == 0) return tmp1;
2407         if (tmp1 == tmp2) return tmp1;
2408         break;
2409 
2410       case BE_OR:
2411         tmp1 = GetExprOrder(expr->bin.lft);
2412         if (tmp1 == 0) return 0;
2413         tmp2 = GetExprOrder(expr->bin.rgt);
2414         if (tmp2 == 0) return 0;
2415         if (tmp1 == tmp2) return tmp1;
2416         break;
2417       }
2418 
2419     return 0;
2420   }
2421 
GetExprCharge(AtomExpr * expr)2422   static int GetExprCharge(AtomExpr *expr)
2423   {
2424     int tmp1,tmp2;
2425 
2426     switch( expr->type )
2427       {
2428       case AE_CHARGE:
2429         return expr->leaf.value;
2430 
2431       case AE_ANDHI:
2432       case AE_ANDLO:
2433         tmp1 = GetExprCharge(expr->bin.lft);
2434         tmp2 = GetExprCharge(expr->bin.rgt);
2435         if (tmp1 == 0) return tmp2;
2436         if (tmp2 == 0) return tmp1;
2437         if (tmp1 == tmp2) return tmp1;
2438         break;
2439 
2440       case AE_OR:
2441         tmp1 = GetExprCharge(expr->bin.lft);
2442         if (tmp1 == 0) return 0;
2443         tmp2 = GetExprCharge(expr->bin.rgt);
2444         if (tmp2 == 0) return 0;
2445         if (tmp1 == tmp2) return tmp1;
2446         break;
2447       }
2448 
2449     return 0;
2450   }
2451 
GetCharge(int idx)2452   int OBSmartsPattern::GetCharge(int idx)
2453   {
2454     return GetExprCharge(_pat->atom[idx].expr);
2455   }
2456 
GetExprAtomicNum(AtomExpr * expr)2457   static int GetExprAtomicNum(AtomExpr *expr)
2458   {
2459     int tmp1,tmp2;
2460 
2461     switch( expr->type )
2462       {
2463       case AE_ELEM:
2464       case AE_AROMELEM:
2465       case AE_ALIPHELEM:
2466         return expr->leaf.value;
2467 
2468       case AE_ANDHI:
2469       case AE_ANDLO:
2470         tmp1 = GetExprAtomicNum(expr->bin.lft);
2471         tmp2 = GetExprAtomicNum(expr->bin.rgt);
2472         if (tmp1 == 0) return tmp2;
2473         if (tmp2 == 0) return tmp1;
2474         if (tmp1 == tmp2) return tmp1;
2475         break;
2476 
2477       case AE_OR:
2478         tmp1 = GetExprAtomicNum(expr->bin.lft);
2479         if (tmp1 == 0) return 0;
2480         tmp2 = GetExprAtomicNum(expr->bin.rgt);
2481         if (tmp2 == 0) return 0;
2482         if (tmp1 == tmp2) return tmp1;
2483         break;
2484       }
2485 
2486     return 0;
2487   }
2488 
GetAtomicNum(int idx)2489   int OBSmartsPattern::GetAtomicNum(int idx)
2490   {
2491     return GetExprAtomicNum(_pat->atom[idx].expr);
2492   }
2493 
GetBond(int & src,int & dst,int & ord,int idx)2494   void OBSmartsPattern::GetBond(int &src,int &dst,int &ord,int idx)
2495   {
2496     src = _pat->bond[idx].src;
2497     dst = _pat->bond[idx].dst;
2498     ord = GetExprOrder(_pat->bond[idx].expr);
2499   }
2500 
SmartsLexReplace(std::string & s,std::vector<std::pair<std::string,std::string>> & vlex)2501   void SmartsLexReplace(std::string &s,std::vector<std::pair<std::string,std::string> > &vlex)
2502   {
2503     size_t j,pos;
2504     std::string token,repstr;
2505     std::vector<std::pair<std::string,std::string> >::iterator i;
2506 
2507     for (pos = 0,pos = s.find("$",pos);pos < s.size();pos = s.find("$",pos))
2508       //for (pos = 0,pos = s.find("$",pos);pos != std::string::npos;pos = s.find("$",pos))
2509       {
2510         pos++;
2511         for (j = pos;j < s.size();++j)
2512           if (!isalpha(s[j]) && !isdigit(s[j]) && s[j] != '_')
2513             break;
2514         if (pos == j)
2515           continue;
2516 
2517         token = s.substr(pos,j-pos);
2518         for (i = vlex.begin();i != vlex.end();++i)
2519           if (token == i->first)
2520             {
2521               repstr = "(" + i->second + ")";
2522               s.replace(pos,j-pos,repstr);
2523               j = 0;
2524             }
2525         pos = j;
2526       }
2527   }
2528 
2529 } // end namespace OpenBabel
2530 
2531 //! \file parsmart.cpp
2532 //! \brief Implementation of Daylight SMARTS parser.
2533