1 /**********************************************************************
2 fingerprint.cpp - Implementation of fingerpring base class and fastsearching
3 
4 Copyright (C) 2005 by Chris Morley
5 
6 This file is part of the Open Babel project.
7 For more information, see <http://openbabel.org/>
8 
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation version 2 of the License.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 ***********************************************************************/
18 
19 #include <openbabel/babelconfig.h>
20 
21 #include <vector>
22 #include <algorithm>
23 #include <iosfwd>
24 #include <cstring>
25 #include <fstream>
26 
27 #include <openbabel/fingerprint.h>
28 #include <openbabel/oberror.h>
29 
30 using namespace std;
31 namespace OpenBabel
32 {
33 #if defined(__CYGWIN__) || defined(__MINGW32__)
34   // macro to implement static OBPlugin::PluginMapType& Map()
35   PLUGIN_CPP_FILE(OBFingerprint)
36 #endif
37 
38   const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int);
39 
SetBit(vector<unsigned int> & vec,const unsigned int n)40   void OBFingerprint::SetBit(vector<unsigned int>& vec, const unsigned int n)
41   {
42     vec[n/Getbitsperint()] |= (1 << (n % Getbitsperint()));
43   }
44 
GetBit(const vector<unsigned int> & vec,const unsigned int n)45   bool OBFingerprint::GetBit(const vector<unsigned int>& vec, const unsigned int n)
46   {
47     unsigned int word =vec[n/Getbitsperint()];
48     return (word &= (1 << (n % Getbitsperint())))!=0;
49   }
50 
51   ////////////////////////////////////////
Fold(vector<unsigned int> & vec,unsigned int nbits)52   void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits)
53   {
54     if(nbits<Getbitsperint())
55     {
56       stringstream ss;
57       ss << "Can't fold to less than " << Getbitsperint() << "bits";
58       obErrorLog.ThrowError(__FUNCTION__, ss.str(), obError);
59       return;
60     }
61     // "folding" to a larger # of bits
62     if (nbits > vec.size()*Getbitsperint()) {
63       vec.resize(nbits/Getbitsperint(), 0);
64     }
65     else {
66       // normal folding to smaller vector sizes
67       while(vec.size()*Getbitsperint()/2 >= nbits)
68         vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2,
69                             vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end());
70     }
71   }
72 
73   ////////////////////////////////////////
74 /*  bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt)
75   {
76     Fptpos iter;
77     if(id.empty())
78       iter=FPtsMap().begin();
79     else
80       {
81         iter=FPtsMap().find(id);
82         if(iter!=FPtsMap().end())
83           ++iter;
84       }
85     if(iter==FPtsMap().end())
86       return false;
87     id    = iter->first;
88     pFPrt = iter->second;
89     return true;
90   }
91 
92   OBFingerprint* OBFingerprint::FindFingerprint(const string& ID)
93   {
94     if(ID.empty())
95       return _pDefault;
96     Fptpos iter = FPtsMap().find(ID);
97     if(iter==FPtsMap().end())
98       return NULL;
99     else
100       return iter->second;
101   }
102 */
Tanimoto(const vector<unsigned int> & vec1,const vector<unsigned int> & vec2)103   double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2)
104   {
105     //Independent of sizeof(unsigned int)
106     if(vec1.size()!=vec2.size())
107       return -1; //different number of bits
108     int andbits=0, orbits=0;
109     for (unsigned i=0;i<vec1.size();++i)
110       {
111         int andfp = vec1[i] & vec2[i];
112         int orfp = vec1[i] | vec2[i];
113         //Count bits
114       /* GCC 3.4 supports a "population count" builtin, which on many targets is
115          implemented with a single instruction.  There is a fallback definition
116          in libgcc in case a target does not have one, which should be just as
117          good as the static function below.  */
118 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
119         andbits += __builtin_popcount(andfp);
120         orbits += __builtin_popcount(orfp);
121 #else
122         for(;andfp;andfp=andfp<<1)
123           if(andfp<0) ++andbits;
124         for(;orfp;orfp=orfp<<1)
125           if(orfp<0) ++orbits;
126 #endif
127       }
128     if(orbits==0)
129       return 0.0;
130     return((double)andbits/(double)orbits);
131   }
132 
133   //*****************************************************************
Find(OBBase * pOb,vector<unsigned long> & SeekPositions,unsigned int MaxCandidates)134   bool FastSearch::Find(OBBase* pOb, vector<unsigned long>& SeekPositions,
135                         unsigned int MaxCandidates)
136   {
137     ///Finds chemical objects in datafilename (which must previously have been indexed)
138     ///that have all the same bits set in their fingerprints as in the fingerprint of
139     ///a pattern object. (Usually a substructure search in molecules.)
140     ///The type of fingerprint and its degree of folding does not have to be specified
141     ///here because the values in the index file are used.
142     ///The positions of the candidate matching molecules in the original datafile are returned.
143 
144     vector<unsigned int> vecwords;
145     _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint());
146 
147     vector<unsigned int>candidates; //indices of matches from fingerprint screen
148     candidates.reserve(MaxCandidates);
149 
150     unsigned int dataSize = _index.header.nEntries;
151     //	GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype);
152 
153     unsigned int words = _index.header.words;
154     unsigned int* nextp = &_index.fptdata[0];
155     unsigned int* ppat0 = &vecwords[0];
156     unsigned int* p;
157     unsigned int* ppat;
158     unsigned int i;
159     for(i=0;i<dataSize; ++i) //speed critical section
160       {
161         p=nextp;
162         nextp += words;
163         ppat=ppat0;
164         bool ppat_has_additional_bits = false;
165         while(p<nextp)
166           {
167             if ((*ppat & *p) ^ *ppat) { // any bits in ppat that are not in p?
168               ppat_has_additional_bits = true;
169               break;
170             }
171             p++;
172             ppat++;
173           }
174         if(!ppat_has_additional_bits)
175           {
176             candidates.push_back(i);
177             if(candidates.size()>=MaxCandidates)
178               break;
179           }
180       }
181 
182     if(i<_index.header.nEntries) //premature end to search
183       {
184         stringstream errorMsg;
185         errorMsg << "Stopped looking after " << i << " molecules." << endl;
186         obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning);
187       }
188 
189     vector<unsigned int>::iterator itr;
190     for(itr=candidates.begin();itr!=candidates.end();++itr)
191       {
192         SeekPositions.push_back(_index.seekdata[*itr]);
193       }
194     return true;
195   }
196 
197 ////////////////////////////////////////////////////////////
FindMatch(OBBase * pOb,vector<unsigned long> & SeekPositions,unsigned int MaxCandidates)198  bool FastSearch::FindMatch(OBBase* pOb, vector<unsigned long>& SeekPositions,
199                             unsigned int MaxCandidates)
200 {
201 //Similar to FastSearch::Find() except that successful candidates have all bits the same as the target
202   vector<unsigned int> vecwords;
203   _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint());
204 
205   vector<unsigned int>candidates; //indices of matches from fingerprint screen
206 
207   unsigned int dataSize = _index.header.nEntries;
208   unsigned int words = _index.header.words;
209   unsigned int* nextp = &_index.fptdata[0]; // start of next FP in index
210   unsigned int* ppat0 = &vecwords[0];       // start of target FP
211   unsigned int* p;                          // current position in index
212   unsigned int* ppat;                       // current position in target FP
213   unsigned int i; // need address of this, can't be register
214   for(i=0;i<dataSize; ++i) //speed critical section
215   {
216     p=nextp;
217     nextp += words;
218     ppat=ppat0;
219 
220     while((*p++ == *ppat++ ) && (p<nextp));
221 
222     if(p==nextp)
223     {
224       candidates.push_back(i);
225       if(candidates.size()>=MaxCandidates)
226         break;
227     }
228   }
229 
230   vector<unsigned int>::iterator itr;
231   for(itr=candidates.begin();itr!=candidates.end();++itr)
232     {
233       SeekPositions.push_back(_index.seekdata[*itr]);
234     }
235   return true;
236 }
237 
238   /////////////////////////////////////////////////////////
FindSimilar(OBBase * pOb,multimap<double,unsigned long> & SeekposMap,double MinTani,double MaxTani)239   bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned long>& SeekposMap,
240                                double MinTani, double MaxTani)
241   {
242     vector<unsigned int> targetfp;
243     _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint());
244 
245     unsigned int words = _index.header.words;
246     unsigned int dataSize = _index.header.nEntries;
247     unsigned int* nextp = &_index.fptdata[0];
248     unsigned int* p;
249     unsigned int i;
250     for(i=0;i<dataSize; ++i) //speed critical section
251       {
252         p=nextp;
253         nextp += words;
254         double tani = OBFingerprint::Tanimoto(targetfp,p);
255         if(tani>MinTani && tani < MaxTani)
256           SeekposMap.insert(pair<const double, unsigned long>(tani,_index.seekdata[i]));
257       }
258     return true;
259   }
260 
261   /////////////////////////////////////////////////////////
FindSimilar(OBBase * pOb,multimap<double,unsigned long> & SeekposMap,int nCandidates)262   bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned long>& SeekposMap,
263                                int nCandidates)
264   {
265     ///If nCandidates is zero or omitted the original size of the multimap is used
266     if(nCandidates)
267       {
268         //initialise the multimap with nCandidate zero entries
269         SeekposMap.clear();
270         int i;
271         for(i=0;i<nCandidates;++i)
272           SeekposMap.insert(pair<const double, unsigned long>(0,0));
273       }
274     else if(SeekposMap.size()==0)
275       return false;
276 
277     vector<unsigned int> targetfp;
278     _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint());
279 
280     unsigned int words = _index.header.words;
281     unsigned int dataSize = _index.header.nEntries;
282     unsigned int* nextp = &_index.fptdata[0];
283     unsigned int* p;
284     unsigned int i;
285     for(i=0;i<dataSize; ++i) //speed critical section
286       {
287         p=nextp;
288         nextp += words;
289         double tani = OBFingerprint::Tanimoto(targetfp,p);
290         if(tani>SeekposMap.begin()->first)
291           {
292             SeekposMap.insert(pair<const double, unsigned long>(tani,_index.seekdata[i]));
293             SeekposMap.erase(SeekposMap.begin());
294           }
295       }
296     return true;
297   }
298 
299   /////////////////////////////////////////////////////////
ReadIndex(istream * pIndexstream)300   string FastSearch::ReadIndex(istream* pIndexstream)
301   {
302     //Reads fs index from istream into member variables
303     _index.Read(pIndexstream);
304 
305     _pFP = _index.CheckFP();
306     if(!_pFP)
307       *(_index.header.datafilename) = '\0';
308 
309     return _index.header.datafilename; //will be empty on error
310   }
311 
312   //////////////////////////////////////////////////////////
ReadIndexFile(string IndexFilename)313   string FastSearch::ReadIndexFile(string IndexFilename)
314   {
315     ifstream ifs(IndexFilename.c_str(),ios::binary);
316     if(ifs)
317       return ReadIndex(&ifs);
318     else
319     {
320       string dum;
321       return dum;
322     }
323   }
324 
325   //////////////////////////////////////////////////////////
Read(istream * pIndexstream)326   bool FptIndex::Read(istream* pIndexstream)
327   {
328 //    pIndexstream->read((char*)&(header), sizeof(FptIndexHeader));
329 //    pIndexstream->seekg(header.headerlength);//allows header length to be changed
330 
331     if(!ReadHeader(pIndexstream))
332       {
333         *(header.datafilename) = '\0';
334         return false;
335       }
336 
337     unsigned long nwords = header.nEntries * header.words;
338     fptdata.resize(nwords);
339     seekdata.resize(header.nEntries);
340 
341     pIndexstream->read((char*)&(fptdata[0]), sizeof(unsigned int) * nwords);
342     if(header.seek64)
343       {
344     	pIndexstream->read((char*)&(seekdata[0]), sizeof(unsigned long) * header.nEntries);
345       }
346     else
347       { //legacy format
348 	 vector<unsigned int> tmp(header.nEntries);
349          pIndexstream->read((char*)&(tmp[0]), sizeof(unsigned int) * header.nEntries);
350 	 std::copy(tmp.begin(),tmp.end(),seekdata.begin());
351       }
352 
353     if(pIndexstream->fail())
354       {
355         *(header.datafilename) = '\0';
356         return false;
357       }
358     return true;
359   }
360 
361   //////////////////////////////////////////////////////////
ReadHeader(istream * pIndexstream)362   bool FptIndex::ReadHeader(istream* pIndexstream)
363   {
364     pIndexstream->read( (char*)&header.headerlength, sizeof(unsigned) );
365     pIndexstream->read( (char*)&header.nEntries,     sizeof(unsigned) );
366     pIndexstream->read( (char*)&header.words,        sizeof(unsigned) );
367     pIndexstream->read( (char*)&header.fpid,         sizeof(header.fpid) );
368     pIndexstream->read( (char*)&header.seek64,       sizeof(header.seek64) );
369     pIndexstream->read( (char*)&header.datafilename, sizeof(header.datafilename) );
370     return !pIndexstream->fail();
371  }
372 
373   //////////////////////////////////////////////////////////
CheckFP()374   OBFingerprint* FptIndex::CheckFP()
375   {
376     //check that fingerprint type is available
377     OBFingerprint* pFP = OBFingerprint::FindFingerprint(header.fpid);
378     if(!pFP)
379       {
380         stringstream errorMsg;
381         errorMsg << "Index has Fingerprints of type '" << header.fpid
382                  << " which is not currently loaded." << endl;
383         obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
384       }
385     return pFP; //NULL if not available
386   }
387 
388   //*******************************************************
FastSearchIndexer(string & datafilename,ostream * os,std::string & fpid,int FptBits,int nmols)389   FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os,
390                                        std::string& fpid, int FptBits, int nmols)
391   {
392     ///Starts indexing process
393     _indexstream = os;
394     _nbits=FptBits;
395     _pindex= new FptIndex;
396     _pindex->header.headerlength = 3*sizeof(unsigned)+sizeof(_pindex->header.fpid)
397                                     +sizeof(_pindex->header.datafilename);
398     strncpy(_pindex->header.fpid,fpid.c_str(),15);
399     _pindex->header.fpid[14]='\0'; //ensure fpid is terminated at 14 characters.
400     _pindex->header.seek64 = 1;
401     strncpy(_pindex->header.datafilename, datafilename.c_str(), 255);
402 
403     //just a hint to reserve size of vectors; definitive value set in destructor
404     _pindex->header.nEntries = nmols;
405 
406     //check that fingerprint type is available
407     _pFP = _pindex->CheckFP();
408     if(fpid.empty()) // add id of default FP
409       strcpy(_pindex->header.fpid, _pFP->GetID());
410 
411     //Save a small amount of time by not generating info (FP2 currently)
412     _pFP->SetFlags(_pFP->Flags() | OBFingerprint::FPT_NOINFO);
413   }
414 
415   /////////////////////////////////////////////////////////////
FastSearchIndexer(FptIndex * pindex,std::ostream * os,int nmols)416   FastSearchIndexer::FastSearchIndexer(FptIndex* pindex, std::ostream* os, int nmols)
417   {
418     //nmols is new total number of molecules
419     _indexstream = os;
420     _pindex = pindex;
421     _nbits  = _pindex->header.words * OBFingerprint::Getbitsperint();
422 
423     //just a hint to reserve size of vectors; definitive value set in destructor
424     _pindex->header.nEntries = nmols;
425 
426     //check that fingerprint type is available
427     _pFP = _pindex->CheckFP();
428   }
429 
430   /////////////////////////////////////////////////////////////
~FastSearchIndexer()431   FastSearchIndexer::~FastSearchIndexer()
432   {
433     ///Saves index file
434     FptIndexHeader& hdr = _pindex->header;
435     hdr.nEntries = _pindex->seekdata.size();
436     //Write header
437     //_indexstream->write((const char*)&hdr, sizeof(FptIndexHeader));
438     _indexstream->write( (const char*)&hdr.headerlength, sizeof(unsigned) );
439     _indexstream->write( (const char*)&hdr.nEntries,     sizeof(unsigned) );
440     _indexstream->write( (const char*)&hdr.words,        sizeof(unsigned) );
441     _indexstream->write( (const char*)&hdr.fpid,         sizeof(hdr.fpid) );
442     _indexstream->write( (const char*)&hdr.seek64,         sizeof(hdr.seek64) );
443     _indexstream->write( (const char*)&hdr.datafilename, sizeof(hdr.datafilename) );
444 
445     _indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int));
446     _indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned long));
447     if(!_indexstream)
448       obErrorLog.ThrowError(__FUNCTION__,
449                             "Difficulty writing index", obWarning);
450     delete _pindex;
451 
452     _pFP->SetFlags(_pFP->Flags() & ~OBFingerprint::FPT_NOINFO); //Clear
453   }
454 
455   ///////////////////////////////////////////////////////////////
Add(OBBase * pOb,std::streampos seekpos)456   bool FastSearchIndexer::Add(OBBase* pOb, std::streampos seekpos)
457   {
458     ///Adds a fingerprint
459 
460     vector<unsigned int> vecwords;
461     if(!_pFP)
462       return false;
463     if(_pFP->GetFingerprint(pOb, vecwords, _nbits))
464       {
465         _pindex->header.words = vecwords.size(); //Use size as returned from fingerprint
466         if(_pindex->fptdata.empty() && _pindex->header.nEntries!=0)
467         {
468           //Reserve size of vectors at start to avoid multiple realloction and copying later.
469           //Done here rather than in constructor because needs the size of the fingerprint.
470           _pindex->fptdata.reserve(_pindex->header.nEntries * _pindex->header.words);
471           _pindex->seekdata.reserve(_pindex->header.nEntries);
472         }
473         for(unsigned int i=0;i<_pindex->header.words;++i)
474           _pindex->fptdata.push_back(vecwords[i]);
475         _pindex->seekdata.push_back(seekpos);
476         return true;
477       }
478     obErrorLog.ThrowError(__FUNCTION__, "Failed to make a fingerprint", obWarning);
479     return false;
480   }
481 
482   /*!
483     \class OBFingerprint fingerprint.h <openbabel/fingerprint.h>
484     These fingerprints are condensed representation of molecules (or other objects)
485     as a list of boolean values (actually bits in a vector<unsigned>) with length
486     of a power of 2. The main motivation is for fast searching of data sources
487     containing large numbers of molecules (up to several million). Open Babel
488     provides some routines which can search text files containing lists of molecules
489     in any format. See the documentation on the class FastSearch.
490 
491     There are descriptions of molecular fingerprints at <br>
492     http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br>
493     http://www.mesaac.com/Fingerprint.htm <br>
494     Many methods of preparing fingerprints have been described, but the type supported
495     currently in OpenBabel has each bit representing a substructure (or other
496     molecular property). If a substructure is present in the molecule, then a
497     particular bit is set to 1. But because the hashing method may also map other
498     substructures to the same bit, a match does not guarantee that a particular
499     substructure is present; there may be false positives.  However, with proper design,
500     a large fraction of irrelevant molecules in a data set can be eliminated in a
501     fast search with boolean methods on the fingerprints.
502     It then becomes feasible to make a definitive substructure search by
503     conventional methods on this reduced list even if it is slow.
504 
505     OpenBabel provides a framework for applying new types of fingerprints without
506     changing any existing code. They are derived from OBFingerprint and the
507     source file is just compiled with the rest of OpenBabel. Alternatively,
508     they can be separately compiled as a DLL or shared library and discovered
509     when OpenBabel runs.
510 
511     For more on these specific implementations of fingerprints in Open
512     Babel, please take a look at the developer's wiki:
513     http://openbabel.org/wiki/Fingerprints
514 
515     Fingerprints derived from this abstract base class OBFingerprint can be for any
516     object derived from OBBase (not just for OBMol).
517     Each derived class provides an ID as a string and OBFingerprint keeps a map of
518     these to provides a pointer to the class when requested in FindFingerprint.
519 
520     <h4>-- To define a fingerprint type --</h4>
521     The classes derived form OBFingerprint are required to provide
522     a GetFingerprint() routine and a Description() routine
523     \code
524     class MyFpType : OBFingerprint
525     {
526        MyFpType(const char* id) : OBFingerprint(id){};
527 
528        virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits)
529        {
530           //Convert pOb to the required type, usually OBMol
531           OBMol* pmol = dynamic_cast<OBMol*>(pOb);
532           fp.resize(required_number_of_words);
533           ...
534           use SetBit(fp,n); to set the nth bit
535 
536           if(nbits)
537              Fold(fp, nbits);
538        }
539 
540        virtual const char* Description(){ return "Some descriptive text";}
541        ...
542     };
543     \endcode
544 
545     Declare a global instance with the ID you will use in -f options to specify
546     its use.
547     \code
548     MyFpType theMyFpType("myfpID");
549     \endcode
550 
551     <h4>-- To obtain a fingerprint --</h4>
552     \code
553     OBMol mol;
554     ...
555     vector<unsigned int> fp;
556     OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint
557     \endcode
558     or
559     \code
560     vector<unsigned int> fp;
561     OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID");
562     ...and maybe...
563     pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger
564     \endcode
565 
566     <h4>-- To print a list of available fingerprint types --</h4>
567     \code
568     std::string id;
569     OBFingerPrint* pFPrt=NULL;
570     while(OBFingerprint::GetNextFPrt(id, pFPrt))
571     {
572        cout << id << " -- " << pFPrt->Description() << endl;
573     }
574     \endcode
575 
576     Fingerprints are handled as vector<unsigned int> so that the number of bits
577     in this vector and their order will be platform and compiler
578     dependent, because of size of int types and endian differences.
579     Use fingerprints (and fastsearch indexes containing them) only
580     for comparing with other fingerprints prepared on the same machine.
581 
582     The FingerprintFormat class is an output format which displays fingerprints
583     as hexadecimal. When multiple molecules are supplied it will calculate the
584     Tanimoto coefficient from the first molecule to each of the others. It also
585     shows whether the first molecule is a possible substructure to all the others,
586     i.e. whether all the bits set in the fingerprint for the first molecule are
587     set in the fingerprint of the others. To display hexadecimal information when
588     multiple molecules are provided it is necessay to use the -xh option.
589 
590     To see a list of available format types, type obabel -F on the command line.
591     The -xF option of the FingerprintFormat class also provides this output, but due
592     to a quirk in the way the program works, it is necessary to have a valid input
593     molecule for this option to work.
594   */
595 
596   /*! \class FastSearch fingerprint.h <openbabel/fingerprint.h>
597     The FastSearch class searches an index to a datafile containing a list of molecules
598     (or other objects) prepared by FastSearchIndexer.
599 
600     OpenBabel can also search files for molecules containing a substructure specified
601     by a SMARTS string, using OBSmartsPattern or from the command line:
602     \code
603     obabel datafile.xxx -O outfile.yyy -sSMARTS
604     \endcode
605     But when the data file contains more than about 10,000 molecules this becomes
606     rather too slow to be used interactively. To do it more quickly, an index
607     of the molecules containing their fingerprints (see OBFingerprint) is prepared using
608     FastSearchIndexer. The indexing process may take a long time but only has to
609     be done once. The index can be searched very quickly with FastSearch. Because
610     of the nature of fingerprints a match to a bit does not guarantee
611     the presence of a particular substructure or other molecular property, so that
612     a definitive answer may require a subsequent search of the (much reduced) list
613     of candidates by another method (like OBSmartsPattern).
614 
615     Note that the index files are not portable. They need to be prepared on the
616     computer that will access them.
617 
618     <h4>Using FastSearch and FastSearchIndexer in a program</h4>
619 
620     The index has two tables:
621     - an array of fingerprints of the molecules
622     - an array of the seek positions in the datasource file of all the molecules
623 
624     <h4>To prepare an fastsearch index file:</h4>
625     - Open an ostream to the index file.
626     - Make a FastSearchIndexer object on the heap or the stack, passing in as parameters:
627     - the datafile name, the indexfile stream,
628     - the id of the fingerprint type to be used,
629     -  the number of bits the fingerprint is to be folded down to, If it is to be left
630     unfolded, set fpid to 0 or do not specify it.
631     .
632     - For each molecule, call Add() with its pointer and its position in the datafile.<br>
633     Currently the std::streampos value is implicitly cast to unsigned int so that
634     for 32bit machines the datafile has to be no longer than about 2Gbyte.
635     - The index file is written when the FastSearchIndexer object is deleted or goes out
636     of scope.
637 
638     <h4>To search in a fastsearch index file</h4>
639 
640     - Open a std::istream to the indexfile (in binary mode on some systems)
641     - Make a FastSearch object, read the index and open the datafile whose
642     name it provides
643     \code
644     ifstream ifs(indexname,ios::binary);
645     FastSearch fs;
646     string datafilename = fs.ReadIndex(&ifs);
647     if(datafilename.empty()
648        return false;
649 
650     ifstream datastream(datafilename);
651     if(!datastream)
652        return false;
653     \endcode
654 
655     <strong>To do a search for molecules which have all the substructure bits the
656     OBMol object, patternMol</strong>
657     \code
658     vector<unsigned int>& SeekPositions;
659     if(!fs.Find(patternMol, SeekPositions, MaxCandidates))
660 	    for(itr=SeekPositions.begin();itr!=SeekPositions.end();++itr)
661       {
662          datastream.seekg(*itr);
663          ... read the candidate molecule
664          and subject to more rigorous test if necessary
665       }
666     \endcode
667 
668     <strong>To do a similarity search based on the Tanimoto coefficient</strong>
669     This is defined as: <br>
670     <em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br>
671     where the boolean operations between the fingerprints are bitwise<br>
672     The Tanimoto coefficient has no absolute meaning and depends on
673     the design of the fingerprint.
674     \code
675     multimap<double, unsigned int> SeekposMap;
676     // to find n best molecules
677     fs.FindSimilar(&patternMol, SeekposMap, n);
678     ...or
679     // to find molecules with Tanimoto coefficient > MinTani
680     fs.FindSimilar(&patternMol, SeekposMap, MinTani);
681 
682     multimap<double, unsigned int>::reverse_iterator itr;
683     for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
684     {
685        datastream.seekg(itr->second);
686        // ... read the candidate molecule
687        double tani = itr->first;
688     }
689     \endcode
690 
691     The FastSearchFormat class facilitates the use of these routine from the
692     command line or other front end program. For instance:
693 
694     <strong>Prepare an index:</strong>
695     \code
696     obabel datafile.xxx -O index.fs
697     \endcode
698     With options you can specify:
699     - which type of fingerprint to be used, e.g. -xfFP2,
700     -	whether it is folded to a specified number of bits, e.g. -xn128
701     (which should be a power of 2)
702     - whether to pre-select the molecules which are indexed:
703     - by structure e.g only ethers and esters, -sCOC
704     - by excluding molecules with bezene rings, -vc1ccccc1
705     - by position in the datafile e.g. only mols 10 to 90, -f10 -l90
706     .
707     <strong>Substructure search</strong> in a previously prepared index file
708     \code
709     obabel index.fs -O outfile.yyy -sSMILES
710     \endcode
711     The datafile can also be used as the input file, provided the input format is specified as fs
712     \code
713     obabel datafile.xxx -O outfile.yyy -sSMILES -ifs
714     \endcode
715     A "slow" search not using fingerprints would be done on the same data by omitting -ifs.
716     A "slow" search can use SMARTS strings, but the fast search is restricted
717     to the subset which is valid SMILES.
718 
719     With the -S option, the target can be specified as a molecule in a file of any format
720     \code
721     obabel datafile.xxx -O outfile.yyy -Spattern_mol.zzz -ifs
722     \endcode
723     These searches have two stages: a fingerprint search which produces
724     a number of candidate molecules and a definitive search which selects
725     from these using SMARTS pattern matching. The maximum number of candidates
726     is 4000 by default but you can change this with an option
727     e.g. -al 8000  (Note that you need the space between l and the number.)
728     If the fingerprint search reaches the maximum number it will not
729     look further and will tell you at what stage it stopped.
730 
731     <strong>Similarity search</strong> in a previously prepared index file<br>
732     This rather done (rather than a substructure search) if the -at option is used,
733     \code
734     obabel datafile.xxx -O outfile.yyy -sSMILES -ifs -at0.7
735     \endcode
736     for instance
737     - -at0.7 will recover all molecules with Tanimoto greater than 0.7
738     - -at15 (no decimal point) will recover the 15 molecules with largest coefficients.
739     - -aa will add the Tanimoto coefficient to the titles of the output molecules.
740 
741     All stages, the indexing, the interpretation of the SMILES string in the -s option,
742     the file in the -S option and the final SMARTS filter convert to OBMol and apply
743     ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O  and N(=O)=O
744     are recognized as chemically identical.
745   */
746 
747 }//Openbabel
748 
749 //! \file fingerprint.cpp
750 //! \brief Definitions for OBFingerprint base class and fastsearch classes
751