1 /********************************************************************** 2 fingerprint.cpp - Implementation of fingerpring base class and fastsearching 3 4 Copyright (C) 2005 by Chris Morley 5 6 This file is part of the Open Babel project. 7 For more information, see <http://openbabel.org/> 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation version 2 of the License. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 ***********************************************************************/ 18 19 #include <openbabel/babelconfig.h> 20 21 #include <vector> 22 #include <algorithm> 23 #include <iosfwd> 24 #include <cstring> 25 #include <fstream> 26 27 #include <openbabel/fingerprint.h> 28 #include <openbabel/oberror.h> 29 30 using namespace std; 31 namespace OpenBabel 32 { 33 #if defined(__CYGWIN__) || defined(__MINGW32__) 34 // macro to implement static OBPlugin::PluginMapType& Map() 35 PLUGIN_CPP_FILE(OBFingerprint) 36 #endif 37 38 const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int); 39 SetBit(vector<unsigned int> & vec,const unsigned int n)40 void OBFingerprint::SetBit(vector<unsigned int>& vec, const unsigned int n) 41 { 42 vec[n/Getbitsperint()] |= (1 << (n % Getbitsperint())); 43 } 44 GetBit(const vector<unsigned int> & vec,const unsigned int n)45 bool OBFingerprint::GetBit(const vector<unsigned int>& vec, const unsigned int n) 46 { 47 unsigned int word =vec[n/Getbitsperint()]; 48 return (word &= (1 << (n % Getbitsperint())))!=0; 49 } 50 51 //////////////////////////////////////// Fold(vector<unsigned int> & vec,unsigned int nbits)52 void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits) 53 { 54 if(nbits<Getbitsperint()) 55 { 56 stringstream ss; 57 ss << "Can't fold to less than " << Getbitsperint() << "bits"; 58 obErrorLog.ThrowError(__FUNCTION__, ss.str(), obError); 59 return; 60 } 61 // "folding" to a larger # of bits 62 if (nbits > vec.size()*Getbitsperint()) { 63 vec.resize(nbits/Getbitsperint(), 0); 64 } 65 else { 66 // normal folding to smaller vector sizes 67 while(vec.size()*Getbitsperint()/2 >= nbits) 68 vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2, 69 vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end()); 70 } 71 } 72 73 //////////////////////////////////////// 74 /* bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt) 75 { 76 Fptpos iter; 77 if(id.empty()) 78 iter=FPtsMap().begin(); 79 else 80 { 81 iter=FPtsMap().find(id); 82 if(iter!=FPtsMap().end()) 83 ++iter; 84 } 85 if(iter==FPtsMap().end()) 86 return false; 87 id = iter->first; 88 pFPrt = iter->second; 89 return true; 90 } 91 92 OBFingerprint* OBFingerprint::FindFingerprint(const string& ID) 93 { 94 if(ID.empty()) 95 return _pDefault; 96 Fptpos iter = FPtsMap().find(ID); 97 if(iter==FPtsMap().end()) 98 return NULL; 99 else 100 return iter->second; 101 } 102 */ Tanimoto(const vector<unsigned int> & vec1,const vector<unsigned int> & vec2)103 double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2) 104 { 105 //Independent of sizeof(unsigned int) 106 if(vec1.size()!=vec2.size()) 107 return -1; //different number of bits 108 int andbits=0, orbits=0; 109 for (unsigned i=0;i<vec1.size();++i) 110 { 111 int andfp = vec1[i] & vec2[i]; 112 int orfp = vec1[i] | vec2[i]; 113 //Count bits 114 /* GCC 3.4 supports a "population count" builtin, which on many targets is 115 implemented with a single instruction. There is a fallback definition 116 in libgcc in case a target does not have one, which should be just as 117 good as the static function below. */ 118 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) 119 andbits += __builtin_popcount(andfp); 120 orbits += __builtin_popcount(orfp); 121 #else 122 for(;andfp;andfp=andfp<<1) 123 if(andfp<0) ++andbits; 124 for(;orfp;orfp=orfp<<1) 125 if(orfp<0) ++orbits; 126 #endif 127 } 128 if(orbits==0) 129 return 0.0; 130 return((double)andbits/(double)orbits); 131 } 132 133 //***************************************************************** Find(OBBase * pOb,vector<unsigned long> & SeekPositions,unsigned int MaxCandidates)134 bool FastSearch::Find(OBBase* pOb, vector<unsigned long>& SeekPositions, 135 unsigned int MaxCandidates) 136 { 137 ///Finds chemical objects in datafilename (which must previously have been indexed) 138 ///that have all the same bits set in their fingerprints as in the fingerprint of 139 ///a pattern object. (Usually a substructure search in molecules.) 140 ///The type of fingerprint and its degree of folding does not have to be specified 141 ///here because the values in the index file are used. 142 ///The positions of the candidate matching molecules in the original datafile are returned. 143 144 vector<unsigned int> vecwords; 145 _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint()); 146 147 vector<unsigned int>candidates; //indices of matches from fingerprint screen 148 candidates.reserve(MaxCandidates); 149 150 unsigned int dataSize = _index.header.nEntries; 151 // GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype); 152 153 unsigned int words = _index.header.words; 154 unsigned int* nextp = &_index.fptdata[0]; 155 unsigned int* ppat0 = &vecwords[0]; 156 unsigned int* p; 157 unsigned int* ppat; 158 unsigned int i; 159 for(i=0;i<dataSize; ++i) //speed critical section 160 { 161 p=nextp; 162 nextp += words; 163 ppat=ppat0; 164 bool ppat_has_additional_bits = false; 165 while(p<nextp) 166 { 167 if ((*ppat & *p) ^ *ppat) { // any bits in ppat that are not in p? 168 ppat_has_additional_bits = true; 169 break; 170 } 171 p++; 172 ppat++; 173 } 174 if(!ppat_has_additional_bits) 175 { 176 candidates.push_back(i); 177 if(candidates.size()>=MaxCandidates) 178 break; 179 } 180 } 181 182 if(i<_index.header.nEntries) //premature end to search 183 { 184 stringstream errorMsg; 185 errorMsg << "Stopped looking after " << i << " molecules." << endl; 186 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning); 187 } 188 189 vector<unsigned int>::iterator itr; 190 for(itr=candidates.begin();itr!=candidates.end();++itr) 191 { 192 SeekPositions.push_back(_index.seekdata[*itr]); 193 } 194 return true; 195 } 196 197 //////////////////////////////////////////////////////////// FindMatch(OBBase * pOb,vector<unsigned long> & SeekPositions,unsigned int MaxCandidates)198 bool FastSearch::FindMatch(OBBase* pOb, vector<unsigned long>& SeekPositions, 199 unsigned int MaxCandidates) 200 { 201 //Similar to FastSearch::Find() except that successful candidates have all bits the same as the target 202 vector<unsigned int> vecwords; 203 _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint()); 204 205 vector<unsigned int>candidates; //indices of matches from fingerprint screen 206 207 unsigned int dataSize = _index.header.nEntries; 208 unsigned int words = _index.header.words; 209 unsigned int* nextp = &_index.fptdata[0]; // start of next FP in index 210 unsigned int* ppat0 = &vecwords[0]; // start of target FP 211 unsigned int* p; // current position in index 212 unsigned int* ppat; // current position in target FP 213 unsigned int i; // need address of this, can't be register 214 for(i=0;i<dataSize; ++i) //speed critical section 215 { 216 p=nextp; 217 nextp += words; 218 ppat=ppat0; 219 220 while((*p++ == *ppat++ ) && (p<nextp)); 221 222 if(p==nextp) 223 { 224 candidates.push_back(i); 225 if(candidates.size()>=MaxCandidates) 226 break; 227 } 228 } 229 230 vector<unsigned int>::iterator itr; 231 for(itr=candidates.begin();itr!=candidates.end();++itr) 232 { 233 SeekPositions.push_back(_index.seekdata[*itr]); 234 } 235 return true; 236 } 237 238 ///////////////////////////////////////////////////////// FindSimilar(OBBase * pOb,multimap<double,unsigned long> & SeekposMap,double MinTani,double MaxTani)239 bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned long>& SeekposMap, 240 double MinTani, double MaxTani) 241 { 242 vector<unsigned int> targetfp; 243 _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint()); 244 245 unsigned int words = _index.header.words; 246 unsigned int dataSize = _index.header.nEntries; 247 unsigned int* nextp = &_index.fptdata[0]; 248 unsigned int* p; 249 unsigned int i; 250 for(i=0;i<dataSize; ++i) //speed critical section 251 { 252 p=nextp; 253 nextp += words; 254 double tani = OBFingerprint::Tanimoto(targetfp,p); 255 if(tani>MinTani && tani < MaxTani) 256 SeekposMap.insert(pair<const double, unsigned long>(tani,_index.seekdata[i])); 257 } 258 return true; 259 } 260 261 ///////////////////////////////////////////////////////// FindSimilar(OBBase * pOb,multimap<double,unsigned long> & SeekposMap,int nCandidates)262 bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned long>& SeekposMap, 263 int nCandidates) 264 { 265 ///If nCandidates is zero or omitted the original size of the multimap is used 266 if(nCandidates) 267 { 268 //initialise the multimap with nCandidate zero entries 269 SeekposMap.clear(); 270 int i; 271 for(i=0;i<nCandidates;++i) 272 SeekposMap.insert(pair<const double, unsigned long>(0,0)); 273 } 274 else if(SeekposMap.size()==0) 275 return false; 276 277 vector<unsigned int> targetfp; 278 _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint()); 279 280 unsigned int words = _index.header.words; 281 unsigned int dataSize = _index.header.nEntries; 282 unsigned int* nextp = &_index.fptdata[0]; 283 unsigned int* p; 284 unsigned int i; 285 for(i=0;i<dataSize; ++i) //speed critical section 286 { 287 p=nextp; 288 nextp += words; 289 double tani = OBFingerprint::Tanimoto(targetfp,p); 290 if(tani>SeekposMap.begin()->first) 291 { 292 SeekposMap.insert(pair<const double, unsigned long>(tani,_index.seekdata[i])); 293 SeekposMap.erase(SeekposMap.begin()); 294 } 295 } 296 return true; 297 } 298 299 ///////////////////////////////////////////////////////// ReadIndex(istream * pIndexstream)300 string FastSearch::ReadIndex(istream* pIndexstream) 301 { 302 //Reads fs index from istream into member variables 303 _index.Read(pIndexstream); 304 305 _pFP = _index.CheckFP(); 306 if(!_pFP) 307 *(_index.header.datafilename) = '\0'; 308 309 return _index.header.datafilename; //will be empty on error 310 } 311 312 ////////////////////////////////////////////////////////// ReadIndexFile(string IndexFilename)313 string FastSearch::ReadIndexFile(string IndexFilename) 314 { 315 ifstream ifs(IndexFilename.c_str(),ios::binary); 316 if(ifs) 317 return ReadIndex(&ifs); 318 else 319 { 320 string dum; 321 return dum; 322 } 323 } 324 325 ////////////////////////////////////////////////////////// Read(istream * pIndexstream)326 bool FptIndex::Read(istream* pIndexstream) 327 { 328 // pIndexstream->read((char*)&(header), sizeof(FptIndexHeader)); 329 // pIndexstream->seekg(header.headerlength);//allows header length to be changed 330 331 if(!ReadHeader(pIndexstream)) 332 { 333 *(header.datafilename) = '\0'; 334 return false; 335 } 336 337 unsigned long nwords = header.nEntries * header.words; 338 fptdata.resize(nwords); 339 seekdata.resize(header.nEntries); 340 341 pIndexstream->read((char*)&(fptdata[0]), sizeof(unsigned int) * nwords); 342 if(header.seek64) 343 { 344 pIndexstream->read((char*)&(seekdata[0]), sizeof(unsigned long) * header.nEntries); 345 } 346 else 347 { //legacy format 348 vector<unsigned int> tmp(header.nEntries); 349 pIndexstream->read((char*)&(tmp[0]), sizeof(unsigned int) * header.nEntries); 350 std::copy(tmp.begin(),tmp.end(),seekdata.begin()); 351 } 352 353 if(pIndexstream->fail()) 354 { 355 *(header.datafilename) = '\0'; 356 return false; 357 } 358 return true; 359 } 360 361 ////////////////////////////////////////////////////////// ReadHeader(istream * pIndexstream)362 bool FptIndex::ReadHeader(istream* pIndexstream) 363 { 364 pIndexstream->read( (char*)&header.headerlength, sizeof(unsigned) ); 365 pIndexstream->read( (char*)&header.nEntries, sizeof(unsigned) ); 366 pIndexstream->read( (char*)&header.words, sizeof(unsigned) ); 367 pIndexstream->read( (char*)&header.fpid, sizeof(header.fpid) ); 368 pIndexstream->read( (char*)&header.seek64, sizeof(header.seek64) ); 369 pIndexstream->read( (char*)&header.datafilename, sizeof(header.datafilename) ); 370 return !pIndexstream->fail(); 371 } 372 373 ////////////////////////////////////////////////////////// CheckFP()374 OBFingerprint* FptIndex::CheckFP() 375 { 376 //check that fingerprint type is available 377 OBFingerprint* pFP = OBFingerprint::FindFingerprint(header.fpid); 378 if(!pFP) 379 { 380 stringstream errorMsg; 381 errorMsg << "Index has Fingerprints of type '" << header.fpid 382 << " which is not currently loaded." << endl; 383 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); 384 } 385 return pFP; //NULL if not available 386 } 387 388 //******************************************************* FastSearchIndexer(string & datafilename,ostream * os,std::string & fpid,int FptBits,int nmols)389 FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os, 390 std::string& fpid, int FptBits, int nmols) 391 { 392 ///Starts indexing process 393 _indexstream = os; 394 _nbits=FptBits; 395 _pindex= new FptIndex; 396 _pindex->header.headerlength = 3*sizeof(unsigned)+sizeof(_pindex->header.fpid) 397 +sizeof(_pindex->header.datafilename); 398 strncpy(_pindex->header.fpid,fpid.c_str(),15); 399 _pindex->header.fpid[14]='\0'; //ensure fpid is terminated at 14 characters. 400 _pindex->header.seek64 = 1; 401 strncpy(_pindex->header.datafilename, datafilename.c_str(), 255); 402 403 //just a hint to reserve size of vectors; definitive value set in destructor 404 _pindex->header.nEntries = nmols; 405 406 //check that fingerprint type is available 407 _pFP = _pindex->CheckFP(); 408 if(fpid.empty()) // add id of default FP 409 strcpy(_pindex->header.fpid, _pFP->GetID()); 410 411 //Save a small amount of time by not generating info (FP2 currently) 412 _pFP->SetFlags(_pFP->Flags() | OBFingerprint::FPT_NOINFO); 413 } 414 415 ///////////////////////////////////////////////////////////// FastSearchIndexer(FptIndex * pindex,std::ostream * os,int nmols)416 FastSearchIndexer::FastSearchIndexer(FptIndex* pindex, std::ostream* os, int nmols) 417 { 418 //nmols is new total number of molecules 419 _indexstream = os; 420 _pindex = pindex; 421 _nbits = _pindex->header.words * OBFingerprint::Getbitsperint(); 422 423 //just a hint to reserve size of vectors; definitive value set in destructor 424 _pindex->header.nEntries = nmols; 425 426 //check that fingerprint type is available 427 _pFP = _pindex->CheckFP(); 428 } 429 430 ///////////////////////////////////////////////////////////// ~FastSearchIndexer()431 FastSearchIndexer::~FastSearchIndexer() 432 { 433 ///Saves index file 434 FptIndexHeader& hdr = _pindex->header; 435 hdr.nEntries = _pindex->seekdata.size(); 436 //Write header 437 //_indexstream->write((const char*)&hdr, sizeof(FptIndexHeader)); 438 _indexstream->write( (const char*)&hdr.headerlength, sizeof(unsigned) ); 439 _indexstream->write( (const char*)&hdr.nEntries, sizeof(unsigned) ); 440 _indexstream->write( (const char*)&hdr.words, sizeof(unsigned) ); 441 _indexstream->write( (const char*)&hdr.fpid, sizeof(hdr.fpid) ); 442 _indexstream->write( (const char*)&hdr.seek64, sizeof(hdr.seek64) ); 443 _indexstream->write( (const char*)&hdr.datafilename, sizeof(hdr.datafilename) ); 444 445 _indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int)); 446 _indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned long)); 447 if(!_indexstream) 448 obErrorLog.ThrowError(__FUNCTION__, 449 "Difficulty writing index", obWarning); 450 delete _pindex; 451 452 _pFP->SetFlags(_pFP->Flags() & ~OBFingerprint::FPT_NOINFO); //Clear 453 } 454 455 /////////////////////////////////////////////////////////////// Add(OBBase * pOb,std::streampos seekpos)456 bool FastSearchIndexer::Add(OBBase* pOb, std::streampos seekpos) 457 { 458 ///Adds a fingerprint 459 460 vector<unsigned int> vecwords; 461 if(!_pFP) 462 return false; 463 if(_pFP->GetFingerprint(pOb, vecwords, _nbits)) 464 { 465 _pindex->header.words = vecwords.size(); //Use size as returned from fingerprint 466 if(_pindex->fptdata.empty() && _pindex->header.nEntries!=0) 467 { 468 //Reserve size of vectors at start to avoid multiple realloction and copying later. 469 //Done here rather than in constructor because needs the size of the fingerprint. 470 _pindex->fptdata.reserve(_pindex->header.nEntries * _pindex->header.words); 471 _pindex->seekdata.reserve(_pindex->header.nEntries); 472 } 473 for(unsigned int i=0;i<_pindex->header.words;++i) 474 _pindex->fptdata.push_back(vecwords[i]); 475 _pindex->seekdata.push_back(seekpos); 476 return true; 477 } 478 obErrorLog.ThrowError(__FUNCTION__, "Failed to make a fingerprint", obWarning); 479 return false; 480 } 481 482 /*! 483 \class OBFingerprint fingerprint.h <openbabel/fingerprint.h> 484 These fingerprints are condensed representation of molecules (or other objects) 485 as a list of boolean values (actually bits in a vector<unsigned>) with length 486 of a power of 2. The main motivation is for fast searching of data sources 487 containing large numbers of molecules (up to several million). Open Babel 488 provides some routines which can search text files containing lists of molecules 489 in any format. See the documentation on the class FastSearch. 490 491 There are descriptions of molecular fingerprints at <br> 492 http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br> 493 http://www.mesaac.com/Fingerprint.htm <br> 494 Many methods of preparing fingerprints have been described, but the type supported 495 currently in OpenBabel has each bit representing a substructure (or other 496 molecular property). If a substructure is present in the molecule, then a 497 particular bit is set to 1. But because the hashing method may also map other 498 substructures to the same bit, a match does not guarantee that a particular 499 substructure is present; there may be false positives. However, with proper design, 500 a large fraction of irrelevant molecules in a data set can be eliminated in a 501 fast search with boolean methods on the fingerprints. 502 It then becomes feasible to make a definitive substructure search by 503 conventional methods on this reduced list even if it is slow. 504 505 OpenBabel provides a framework for applying new types of fingerprints without 506 changing any existing code. They are derived from OBFingerprint and the 507 source file is just compiled with the rest of OpenBabel. Alternatively, 508 they can be separately compiled as a DLL or shared library and discovered 509 when OpenBabel runs. 510 511 For more on these specific implementations of fingerprints in Open 512 Babel, please take a look at the developer's wiki: 513 http://openbabel.org/wiki/Fingerprints 514 515 Fingerprints derived from this abstract base class OBFingerprint can be for any 516 object derived from OBBase (not just for OBMol). 517 Each derived class provides an ID as a string and OBFingerprint keeps a map of 518 these to provides a pointer to the class when requested in FindFingerprint. 519 520 <h4>-- To define a fingerprint type --</h4> 521 The classes derived form OBFingerprint are required to provide 522 a GetFingerprint() routine and a Description() routine 523 \code 524 class MyFpType : OBFingerprint 525 { 526 MyFpType(const char* id) : OBFingerprint(id){}; 527 528 virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits) 529 { 530 //Convert pOb to the required type, usually OBMol 531 OBMol* pmol = dynamic_cast<OBMol*>(pOb); 532 fp.resize(required_number_of_words); 533 ... 534 use SetBit(fp,n); to set the nth bit 535 536 if(nbits) 537 Fold(fp, nbits); 538 } 539 540 virtual const char* Description(){ return "Some descriptive text";} 541 ... 542 }; 543 \endcode 544 545 Declare a global instance with the ID you will use in -f options to specify 546 its use. 547 \code 548 MyFpType theMyFpType("myfpID"); 549 \endcode 550 551 <h4>-- To obtain a fingerprint --</h4> 552 \code 553 OBMol mol; 554 ... 555 vector<unsigned int> fp; 556 OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint 557 \endcode 558 or 559 \code 560 vector<unsigned int> fp; 561 OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID"); 562 ...and maybe... 563 pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger 564 \endcode 565 566 <h4>-- To print a list of available fingerprint types --</h4> 567 \code 568 std::string id; 569 OBFingerPrint* pFPrt=NULL; 570 while(OBFingerprint::GetNextFPrt(id, pFPrt)) 571 { 572 cout << id << " -- " << pFPrt->Description() << endl; 573 } 574 \endcode 575 576 Fingerprints are handled as vector<unsigned int> so that the number of bits 577 in this vector and their order will be platform and compiler 578 dependent, because of size of int types and endian differences. 579 Use fingerprints (and fastsearch indexes containing them) only 580 for comparing with other fingerprints prepared on the same machine. 581 582 The FingerprintFormat class is an output format which displays fingerprints 583 as hexadecimal. When multiple molecules are supplied it will calculate the 584 Tanimoto coefficient from the first molecule to each of the others. It also 585 shows whether the first molecule is a possible substructure to all the others, 586 i.e. whether all the bits set in the fingerprint for the first molecule are 587 set in the fingerprint of the others. To display hexadecimal information when 588 multiple molecules are provided it is necessay to use the -xh option. 589 590 To see a list of available format types, type obabel -F on the command line. 591 The -xF option of the FingerprintFormat class also provides this output, but due 592 to a quirk in the way the program works, it is necessary to have a valid input 593 molecule for this option to work. 594 */ 595 596 /*! \class FastSearch fingerprint.h <openbabel/fingerprint.h> 597 The FastSearch class searches an index to a datafile containing a list of molecules 598 (or other objects) prepared by FastSearchIndexer. 599 600 OpenBabel can also search files for molecules containing a substructure specified 601 by a SMARTS string, using OBSmartsPattern or from the command line: 602 \code 603 obabel datafile.xxx -O outfile.yyy -sSMARTS 604 \endcode 605 But when the data file contains more than about 10,000 molecules this becomes 606 rather too slow to be used interactively. To do it more quickly, an index 607 of the molecules containing their fingerprints (see OBFingerprint) is prepared using 608 FastSearchIndexer. The indexing process may take a long time but only has to 609 be done once. The index can be searched very quickly with FastSearch. Because 610 of the nature of fingerprints a match to a bit does not guarantee 611 the presence of a particular substructure or other molecular property, so that 612 a definitive answer may require a subsequent search of the (much reduced) list 613 of candidates by another method (like OBSmartsPattern). 614 615 Note that the index files are not portable. They need to be prepared on the 616 computer that will access them. 617 618 <h4>Using FastSearch and FastSearchIndexer in a program</h4> 619 620 The index has two tables: 621 - an array of fingerprints of the molecules 622 - an array of the seek positions in the datasource file of all the molecules 623 624 <h4>To prepare an fastsearch index file:</h4> 625 - Open an ostream to the index file. 626 - Make a FastSearchIndexer object on the heap or the stack, passing in as parameters: 627 - the datafile name, the indexfile stream, 628 - the id of the fingerprint type to be used, 629 - the number of bits the fingerprint is to be folded down to, If it is to be left 630 unfolded, set fpid to 0 or do not specify it. 631 . 632 - For each molecule, call Add() with its pointer and its position in the datafile.<br> 633 Currently the std::streampos value is implicitly cast to unsigned int so that 634 for 32bit machines the datafile has to be no longer than about 2Gbyte. 635 - The index file is written when the FastSearchIndexer object is deleted or goes out 636 of scope. 637 638 <h4>To search in a fastsearch index file</h4> 639 640 - Open a std::istream to the indexfile (in binary mode on some systems) 641 - Make a FastSearch object, read the index and open the datafile whose 642 name it provides 643 \code 644 ifstream ifs(indexname,ios::binary); 645 FastSearch fs; 646 string datafilename = fs.ReadIndex(&ifs); 647 if(datafilename.empty() 648 return false; 649 650 ifstream datastream(datafilename); 651 if(!datastream) 652 return false; 653 \endcode 654 655 <strong>To do a search for molecules which have all the substructure bits the 656 OBMol object, patternMol</strong> 657 \code 658 vector<unsigned int>& SeekPositions; 659 if(!fs.Find(patternMol, SeekPositions, MaxCandidates)) 660 for(itr=SeekPositions.begin();itr!=SeekPositions.end();++itr) 661 { 662 datastream.seekg(*itr); 663 ... read the candidate molecule 664 and subject to more rigorous test if necessary 665 } 666 \endcode 667 668 <strong>To do a similarity search based on the Tanimoto coefficient</strong> 669 This is defined as: <br> 670 <em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br> 671 where the boolean operations between the fingerprints are bitwise<br> 672 The Tanimoto coefficient has no absolute meaning and depends on 673 the design of the fingerprint. 674 \code 675 multimap<double, unsigned int> SeekposMap; 676 // to find n best molecules 677 fs.FindSimilar(&patternMol, SeekposMap, n); 678 ...or 679 // to find molecules with Tanimoto coefficient > MinTani 680 fs.FindSimilar(&patternMol, SeekposMap, MinTani); 681 682 multimap<double, unsigned int>::reverse_iterator itr; 683 for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr) 684 { 685 datastream.seekg(itr->second); 686 // ... read the candidate molecule 687 double tani = itr->first; 688 } 689 \endcode 690 691 The FastSearchFormat class facilitates the use of these routine from the 692 command line or other front end program. For instance: 693 694 <strong>Prepare an index:</strong> 695 \code 696 obabel datafile.xxx -O index.fs 697 \endcode 698 With options you can specify: 699 - which type of fingerprint to be used, e.g. -xfFP2, 700 - whether it is folded to a specified number of bits, e.g. -xn128 701 (which should be a power of 2) 702 - whether to pre-select the molecules which are indexed: 703 - by structure e.g only ethers and esters, -sCOC 704 - by excluding molecules with bezene rings, -vc1ccccc1 705 - by position in the datafile e.g. only mols 10 to 90, -f10 -l90 706 . 707 <strong>Substructure search</strong> in a previously prepared index file 708 \code 709 obabel index.fs -O outfile.yyy -sSMILES 710 \endcode 711 The datafile can also be used as the input file, provided the input format is specified as fs 712 \code 713 obabel datafile.xxx -O outfile.yyy -sSMILES -ifs 714 \endcode 715 A "slow" search not using fingerprints would be done on the same data by omitting -ifs. 716 A "slow" search can use SMARTS strings, but the fast search is restricted 717 to the subset which is valid SMILES. 718 719 With the -S option, the target can be specified as a molecule in a file of any format 720 \code 721 obabel datafile.xxx -O outfile.yyy -Spattern_mol.zzz -ifs 722 \endcode 723 These searches have two stages: a fingerprint search which produces 724 a number of candidate molecules and a definitive search which selects 725 from these using SMARTS pattern matching. The maximum number of candidates 726 is 4000 by default but you can change this with an option 727 e.g. -al 8000 (Note that you need the space between l and the number.) 728 If the fingerprint search reaches the maximum number it will not 729 look further and will tell you at what stage it stopped. 730 731 <strong>Similarity search</strong> in a previously prepared index file<br> 732 This rather done (rather than a substructure search) if the -at option is used, 733 \code 734 obabel datafile.xxx -O outfile.yyy -sSMILES -ifs -at0.7 735 \endcode 736 for instance 737 - -at0.7 will recover all molecules with Tanimoto greater than 0.7 738 - -at15 (no decimal point) will recover the 15 molecules with largest coefficients. 739 - -aa will add the Tanimoto coefficient to the titles of the output molecules. 740 741 All stages, the indexing, the interpretation of the SMILES string in the -s option, 742 the file in the -S option and the final SMARTS filter convert to OBMol and apply 743 ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O 744 are recognized as chemically identical. 745 */ 746 747 }//Openbabel 748 749 //! \file fingerprint.cpp 750 //! \brief Definitions for OBFingerprint base class and fastsearch classes 751