1 /**********************************************************************
unique.cpp - An OBOp for eliminating chemically identical molecules during conversion.
3 
4 Copyright (C) 2009 by Chris Morley
5 
6 This file is part of the Open Babel project.
7 For more information, see <http://openbabel.org/>
8 
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation version 2 of the License.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 ***********************************************************************/
18 #include <openbabel/babelconfig.h>
19 #include <openbabel/op.h>
20 #include <openbabel/mol.h>
21 #include <openbabel/obconversion.h>
22 #include <openbabel/descriptor.h>
23 #include <openbabel/inchiformat.h>
24 #if defined(_MSC_VER) || defined(_LIBCPP_VERSION)
25   #include <unordered_map>
26 #elif (__GNUC__ == 4 && __GNUC_MINOR__ >= 1 && !defined(__APPLE_CC__))
27   #include <tr1/unordered_map>
28 #else
29   #ifdef USE_BOOST
30     #include <boost/tr1/unordered_map.hpp>
31   #else
32     #define NO_UNORDERED_MAP
33     #include <map>
34   #endif
35 #endif
36 
37 using namespace std;
38 #ifndef NO_UNORDERED_MAP
39   #ifdef _LIBCPP_VERSION
40     using std::unordered_map;
41   #else
42     using std::tr1::unordered_map;
43   #endif
44 #endif
45 namespace OpenBabel
46 {
47 
48 class OpUnique : public OBOp
49 {
50 public:
OpUnique(const char * ID)51   OpUnique(const char* ID) : OBOp(ID, false){
52     OBConversion::RegisterOptionParam("unique", nullptr, 1, OBConversion::GENOPTIONS);
53   }
54 
Description()55   const char* Description(){ return
56     "[param] remove duplicates by descriptor;default inchi\n"
57     "param is a descriptor or property, or a truncation spec for InChI\n"
58     "(making the comparison less detailed, see below).\n"
59     "An OpenBabel warning message is output for each duplicate.\n"
60     "Examples: --unique   --unique cansmi   --unique /nostereo\n\n"
61 
62     "The duplicates can be output instead by making the first character\n"
63     "in the parameter ~  e.g. --unique ~cansmi   --unique ~\n\n"
64 
65     "/formula  formula only\n"
66     "/connect  formula and connectivity only\n"
67     "/nostereo ignore E/Z and sp3 stereochemistry\n"
68     "/nosp3    ignore sp3 stereochemistry\n"
69     "/noEZ     ignore E/Z steroeochemistry\n"
70     "/nochg    ignore charge and protonation\n"
71     "/noiso    ignore isotopes\n\n"
72 ; }
73 
WorksWith(OBBase * pOb) const74   virtual bool WorksWith(OBBase* pOb) const { return dynamic_cast<OBMol*>(pOb) != nullptr; }
75   virtual bool Do(OBBase* pOb, const char* OptionText, OpMap* pmap, OBConversion* pConv);
76 
77 private:
78 
79   bool _reportDup;
80   std::string _trunc;
81   OBDescriptor* _pDesc;
82   unsigned _ndups;
83   bool _inv;
84 
85 #ifdef NO_UNORDERED_MAP
86   typedef map<std::string, std::string> UMap;
87 #else
88   typedef unordered_map<std::string, std::string> UMap;
89 #endif
90 
91   //key is descriptor text(usually inchi) value is molecule title
92   UMap _inchimap;
93 };
94 
/////////////////////////////////////////////////////////////////
// The single global instance of the op. Constructing it during static
// initialization runs the OpUnique constructor with the plugin ID "unique",
// which also registers the "unique" option parameter with OBConversion.
OpUnique theOpUnique("unique"); //Global instance
97 
98 /////////////////////////////////////////////////////////////////
Do(OBBase * pOb,const char * OptionText,OpMap * pmap,OBConversion * pConv)99 bool OpUnique::Do(OBBase* pOb, const char* OptionText, OpMap* pmap, OBConversion* pConv)
100 {
101   OBMol* pmol = dynamic_cast<OBMol*>(pOb);
102   if(!pmol)
103     return false;
104 
105   if(pConv->IsFirstInput())
106   {
107     _ndups=0;
108     string descID("inchi"); // the default
109     _trunc.clear();
110     _inv = OptionText[0]=='~';   //has the parameter a leading ~ ?
111     if(_inv)
112       clog << "The output has the duplicate structures" << endl;
113 
114     if(OptionText[0+_inv]=='/')  //is parameter is /x?
115       _trunc = OptionText+_inv;
116     else if(OptionText[0+_inv]!='\0') // not empty?
117       descID = OptionText+_inv;
118 
119     _pDesc = OBDescriptor::FindType(descID.c_str());
120     if(!_pDesc)
121     {
122       obErrorLog.ThrowError(__FUNCTION__,
123               "Cannot find descriptor " + descID, obError, onceOnly);
124       return false;
125     }
126     _pDesc->Init();
127     _inchimap.clear();
128 
129     _reportDup = !_inv; //do not report duplicates when they are the output
130   }
131 
132   if(!_pDesc)
133     return false;
134   std::string s;
135   _pDesc->GetStringValue(pmol, s);
136 
137   if(!_trunc.empty())
138     InChIFormat::EditInchi(s, _trunc);
139   std::pair<UMap::iterator, bool> result = _inchimap.insert(make_pair(s, pmol->GetTitle()));
140   bool ret = true;
141   if(!s.empty() && !result.second)
142   {
143     // InChI is already present in set
144     ++_ndups;
145     if(_reportDup)
146       clog << "Removed " << pmol->GetTitle() << " - a duplicate of " << result.first->second
147          << " (#" << _ndups << ")" << endl;
148     //delete pOb;
149     ret = false; //filtered out
150   }
151   if(_inv)
152     ret = !ret;
153   if(!ret)
154     delete pOb;
155   return ret;
156 }
157 
158 
159 }//namespace
160 /*
161 Usage: --unique param
162 During conversion, this option eliminates molecules that are identical by some criterion.
163 With current babel interface it needs to be last on the command line. With nbabel
164 it can be anywhere.
165 If param is missing the criterion is the InChI.
166 If param starts with / the criterion is a truncated InChI string, see below.
167 Otherwise param is taken as a descriptor or property ID and the criterion is
its string value. Descriptors which could be useful here are cansmi, cansmiNS
169 (ignores stereo) and possibly title.
170 
171 OpUnique works by attempting to insert the string value of the descriptor for
172 each molecule to an internal std::unordered_map. If the string has been seen
173 previously, the molecule is deleted and OpUnique::Do() returns false, which
174 causes the molecule not to be output.
175 
InChI truncation values. param can be a concatenation of these e.g. /nochg/noiso
177 /formula  formula only
178 /connect formula and connectivity only
179 /nostereo ignore E/Z and sp3 stereochemistry
180 /nosp3    ignore sp3 stereochemistry
/noEZ     ignore E/Z stereochemistry
182 /nochg    ignore charge and protonation
183 /noiso    ignore isotopes
184 
185 */
186