1 /**********************************************************************
unique.cpp - An OBOp for eliminating chemically identical molecules during conversion.
3
4 Copyright (C) 2009 by Chris Morley
5
6 This file is part of the Open Babel project.
7 For more information, see <http://openbabel.org/>
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation version 2 of the License.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17 ***********************************************************************/
18 #include <openbabel/babelconfig.h>
19 #include <openbabel/op.h>
20 #include <openbabel/mol.h>
21 #include <openbabel/obconversion.h>
22 #include <openbabel/descriptor.h>
23 #include <openbabel/inchiformat.h>
24 #if defined(_MSC_VER) || defined(_LIBCPP_VERSION)
25 #include <unordered_map>
26 #elif (__GNUC__ == 4 && __GNUC_MINOR__ >= 1 && !defined(__APPLE_CC__))
27 #include <tr1/unordered_map>
28 #else
29 #ifdef USE_BOOST
30 #include <boost/tr1/unordered_map.hpp>
31 #else
32 #define NO_UNORDERED_MAP
33 #include <map>
34 #endif
35 #endif
36
37 using namespace std;
38 #ifndef NO_UNORDERED_MAP
39 #ifdef _LIBCPP_VERSION
40 using std::unordered_map;
41 #else
42 using std::tr1::unordered_map;
43 #endif
44 #endif
45 namespace OpenBabel
46 {
47
48 class OpUnique : public OBOp
49 {
50 public:
OpUnique(const char * ID)51 OpUnique(const char* ID) : OBOp(ID, false){
52 OBConversion::RegisterOptionParam("unique", nullptr, 1, OBConversion::GENOPTIONS);
53 }
54
Description()55 const char* Description(){ return
56 "[param] remove duplicates by descriptor;default inchi\n"
57 "param is a descriptor or property, or a truncation spec for InChI\n"
58 "(making the comparison less detailed, see below).\n"
59 "An OpenBabel warning message is output for each duplicate.\n"
60 "Examples: --unique --unique cansmi --unique /nostereo\n\n"
61
62 "The duplicates can be output instead by making the first character\n"
63 "in the parameter ~ e.g. --unique ~cansmi --unique ~\n\n"
64
65 "/formula formula only\n"
66 "/connect formula and connectivity only\n"
67 "/nostereo ignore E/Z and sp3 stereochemistry\n"
68 "/nosp3 ignore sp3 stereochemistry\n"
69 "/noEZ ignore E/Z steroeochemistry\n"
70 "/nochg ignore charge and protonation\n"
71 "/noiso ignore isotopes\n\n"
72 ; }
73
WorksWith(OBBase * pOb) const74 virtual bool WorksWith(OBBase* pOb) const { return dynamic_cast<OBMol*>(pOb) != nullptr; }
75 virtual bool Do(OBBase* pOb, const char* OptionText, OpMap* pmap, OBConversion* pConv);
76
77 private:
78
79 bool _reportDup;
80 std::string _trunc;
81 OBDescriptor* _pDesc;
82 unsigned _ndups;
83 bool _inv;
84
85 #ifdef NO_UNORDERED_MAP
86 typedef map<std::string, std::string> UMap;
87 #else
88 typedef unordered_map<std::string, std::string> UMap;
89 #endif
90
91 //key is descriptor text(usually inchi) value is molecule title
92 UMap _inchimap;
93 };
94
95 /////////////////////////////////////////////////////////////////
96 OpUnique theOpUnique("unique"); //Global instance
97
98 /////////////////////////////////////////////////////////////////
Do(OBBase * pOb,const char * OptionText,OpMap * pmap,OBConversion * pConv)99 bool OpUnique::Do(OBBase* pOb, const char* OptionText, OpMap* pmap, OBConversion* pConv)
100 {
101 OBMol* pmol = dynamic_cast<OBMol*>(pOb);
102 if(!pmol)
103 return false;
104
105 if(pConv->IsFirstInput())
106 {
107 _ndups=0;
108 string descID("inchi"); // the default
109 _trunc.clear();
110 _inv = OptionText[0]=='~'; //has the parameter a leading ~ ?
111 if(_inv)
112 clog << "The output has the duplicate structures" << endl;
113
114 if(OptionText[0+_inv]=='/') //is parameter is /x?
115 _trunc = OptionText+_inv;
116 else if(OptionText[0+_inv]!='\0') // not empty?
117 descID = OptionText+_inv;
118
119 _pDesc = OBDescriptor::FindType(descID.c_str());
120 if(!_pDesc)
121 {
122 obErrorLog.ThrowError(__FUNCTION__,
123 "Cannot find descriptor " + descID, obError, onceOnly);
124 return false;
125 }
126 _pDesc->Init();
127 _inchimap.clear();
128
129 _reportDup = !_inv; //do not report duplicates when they are the output
130 }
131
132 if(!_pDesc)
133 return false;
134 std::string s;
135 _pDesc->GetStringValue(pmol, s);
136
137 if(!_trunc.empty())
138 InChIFormat::EditInchi(s, _trunc);
139 std::pair<UMap::iterator, bool> result = _inchimap.insert(make_pair(s, pmol->GetTitle()));
140 bool ret = true;
141 if(!s.empty() && !result.second)
142 {
143 // InChI is already present in set
144 ++_ndups;
145 if(_reportDup)
146 clog << "Removed " << pmol->GetTitle() << " - a duplicate of " << result.first->second
147 << " (#" << _ndups << ")" << endl;
148 //delete pOb;
149 ret = false; //filtered out
150 }
151 if(_inv)
152 ret = !ret;
153 if(!ret)
154 delete pOb;
155 return ret;
156 }
157
158
159 }//namespace
160 /*
161 Usage: --unique param
162 During conversion, this option eliminates molecules that are identical by some criterion.
163 With current babel interface it needs to be last on the command line. With nbabel
164 it can be anywhere.
165 If param is missing the criterion is the InChI.
166 If param starts with / the criterion is a truncated InChI string, see below.
Otherwise param is taken as a descriptor or property ID and the criterion is
its string value. Descriptors which could be useful here are cansmi, cansmiNS
(ignores stereo) and possibly title.
170
171 OpUnique works by attempting to insert the string value of the descriptor for
172 each molecule to an internal std::unordered_map. If the string has been seen
173 previously, the molecule is deleted and OpUnique::Do() returns false, which
174 causes the molecule not to be output.
175
InChI truncation values. param can be a concatenation of these e.g. /nochg/noiso
177 /formula formula only
178 /connect formula and connectivity only
179 /nostereo ignore E/Z and sp3 stereochemistry
180 /nosp3 ignore sp3 stereochemistry
/noEZ ignore E/Z stereochemistry
182 /nochg ignore charge and protonation
183 /noiso ignore isotopes
184
185 */
186