1""" Pandas extension for chemical analysis """ 2from __future__ import absolute_import 3from collections import deque 4from six import BytesIO, StringIO, text_type 5import pandas as pd 6 7import oddt 8 9pd.set_option("display.max_colwidth", 999999) 10 11 12def _mol_reader(fmt='sdf', 13 filepath_or_buffer=None, 14 usecols=None, 15 molecule_column='mol', 16 molecule_name_column='mol_name', 17 smiles_column=None, 18 skip_bad_mols=False, 19 chunksize=None, 20 **kwargs): 21 """Universal reading function for private use. 22 23 .. versionadded:: 0.3 24 25 Parameters 26 ---------- 27 fmt : string 28 The format of molecular file 29 30 filepath_or_buffer : string or None 31 File path 32 33 usecols : list or None, optional (default=None) 34 A list of columns to read from file. If None then all available 35 fields are read. 36 37 molecule_column : string or None, optional (default='mol') 38 Name of molecule column. If None the molecules will be skipped and 39 the reading will be speed up significantly. 40 41 molecule_name_column : string or None, optional (default='mol_name') 42 Column name which will contain molecules' title/name. Column is 43 skipped when set to None. 44 45 smiles_column : string or None, optional (default=None) 46 Column name containg molecules' SMILES, by default it is disabled. 47 48 skip_bad_mols : bool, optional (default=False) 49 Switch to skip empty (bad) molecules. Useful for RDKit, which Returns 50 None if molecule can not sanitize. 51 52 chunksize : int or None, optional (default=None) 53 Size of chunk to return. If set to None whole set is returned. 54 55 Returns 56 ------- 57 chunk : 58 A `ChemDataFrame` containg `chunksize` molecules. 59 60 """ 61 # capture options for reader 62 reader_kwargs = {} 63 if 'opt' in kwargs: 64 reader_kwargs['opt'] = kwargs.pop('opt') 65 if 'sanitize' in kwargs: 66 reader_kwargs['sanitize'] = kwargs.pop('sanitize') 67 68 # when you dont read molecules you can skip parsing them 69 if molecule_column is None: 70 if oddt.toolkit.backend == 'ob' and fmt == 'sdf': 71 if 'opt' in reader_kwargs: 72 reader_kwargs['opt']['P'] = None 73 else: 74 reader_kwargs['opt'] = {'P': None} 75 elif oddt.toolkit.backend == 'rdk': 76 reader_kwargs['sanitize'] = False 77 78 chunk = [] 79 for n, mol in enumerate(oddt.toolkit.readfile(fmt, filepath_or_buffer, 80 **reader_kwargs)): 81 if skip_bad_mols and mol is None: 82 continue # add warning with number of skipped molecules 83 if usecols is None: 84 mol_data = mol.data.to_dict() 85 else: 86 mol_data = dict((k, mol.data[k]) for k in usecols) 87 88 if molecule_column: 89 mol_data[molecule_column] = mol 90 if molecule_name_column: 91 mol_data[molecule_name_column] = mol.title 92 if smiles_column: 93 mol_data[smiles_column] = mol.smiles 94 chunk.append(mol_data) 95 if chunksize and (n + 1) % chunksize == 0: 96 chunk_frm = ChemDataFrame(chunk, **kwargs) 97 chunk_frm._molecule_column = molecule_column 98 yield chunk_frm 99 chunk = [] 100 if chunk or chunksize is None: 101 chunk_frm = ChemDataFrame(chunk, **kwargs) 102 chunk_frm._molecule_column = molecule_column 103 yield chunk_frm 104 105 106def _mol_writer(data, 107 fmt='sdf', 108 filepath_or_buffer=None, 109 update_properties=True, 110 molecule_column=None, 111 columns=None): 112 """Universal writing function for private use. 113 114 .. versionadded:: 0.3 115 116 Parameters 117 ---------- 118 fmt : string 119 The format of molecular file 120 121 filepath_or_buffer : string or None 122 File path 123 124 update_properties : bool, optional (default=True) 125 Switch to update properties from the DataFrames to the molecules 126 while writting. 127 128 molecule_column : string or None, optional (default='mol') 129 Name of molecule column. If None the molecules will be skipped. 130 131 columns : list or None, optional (default=None) 132 A list of columns to write to file. If None then all available 133 fields are written. 134 135 """ 136 if filepath_or_buffer is None: 137 out = StringIO() 138 elif hasattr(filepath_or_buffer, 'write'): 139 out = filepath_or_buffer 140 else: 141 out = oddt.toolkit.Outputfile(fmt, filepath_or_buffer, overwrite=True) 142 if isinstance(data, pd.DataFrame): 143 molecule_column = molecule_column or data._molecule_column 144 for ix, row in data.iterrows(): 145 mol = row[molecule_column].clone 146 if update_properties: 147 new_data = row.to_dict() 148 del new_data[molecule_column] 149 mol.data.update(new_data) 150 if columns: 151 for k in mol.data.keys(): 152 if k not in columns: 153 del mol.data[k] 154 if filepath_or_buffer is None or hasattr(filepath_or_buffer, 'write'): 155 out.write(mol.write(fmt)) 156 else: 157 out.write(mol) 158 elif isinstance(data, pd.Series): 159 for mol in data: 160 if filepath_or_buffer is None or hasattr(filepath_or_buffer, 'write'): 161 out.write(mol.write(fmt)) 162 else: 163 out.write(mol) 164 if filepath_or_buffer is None: 165 return out.getvalue() 166 elif not hasattr(filepath_or_buffer, 'write'): # dont close foreign buffer 167 out.close() 168 169 170def read_csv(*args, **kwargs): 171 """ TODO: Support Chunks """ 172 smiles_to_molecule = kwargs.pop('smiles_to_molecule', None) 173 molecule_column = kwargs.pop('molecule_column', 'mol') 174 data = pd.read_csv(*args, **kwargs) 175 if smiles_to_molecule is not None: 176 data[molecule_column] = data[smiles_to_molecule].map( 177 lambda x: oddt.toolkit.readstring('smi', x)) 178 return data 179 180 181def read_sdf(filepath_or_buffer=None, 182 usecols=None, 183 molecule_column='mol', 184 molecule_name_column='mol_name', 185 smiles_column=None, 186 skip_bad_mols=False, 187 chunksize=None, 188 **kwargs): 189 """Read SDF/MDL multi molecular file to ChemDataFrame 190 191 .. versionadded:: 0.3 192 193 Parameters 194 ---------- 195 filepath_or_buffer : string or None 196 File path 197 198 usecols : list or None, optional (default=None) 199 A list of columns to read from file. If None then all available 200 fields are read. 201 202 molecule_column : string or None, optional (default='mol') 203 Name of molecule column. If None the molecules will be skipped and 204 the reading will be speed up significantly. 205 206 molecule_name_column : string or None, optional (default='mol_name') 207 Column name which will contain molecules' title/name. Column is 208 skipped when set to None. 209 210 smiles_column : string or None, optional (default=None) 211 Column name containg molecules' SMILES, by default it is disabled. 212 213 skip_bad_mols : bool, optional (default=False) 214 Switch to skip empty (bad) molecules. Useful for RDKit, which Returns 215 None if molecule can not sanitize. 216 217 chunksize : int or None, optional (default=None) 218 Size of chunk to return. If set to None whole set is returned. 219 220 Returns 221 ------- 222 result : 223 A `ChemDataFrame` containg all molecules if `chunksize` is None 224 or genrerator of `ChemDataFrame` with `chunksize` molecules. 225 226 """ 227 result = _mol_reader(fmt='sdf', 228 filepath_or_buffer=filepath_or_buffer, 229 usecols=usecols, 230 molecule_column=molecule_column, 231 molecule_name_column=molecule_name_column, 232 smiles_column=smiles_column, 233 skip_bad_mols=skip_bad_mols, 234 chunksize=chunksize, 235 **kwargs) 236 if chunksize: 237 return result 238 else: 239 return deque(result, maxlen=1).pop() 240 241 242def read_mol2(filepath_or_buffer=None, 243 usecols=None, 244 molecule_column='mol', 245 molecule_name_column='mol_name', 246 smiles_column=None, 247 skip_bad_mols=False, 248 chunksize=None, 249 **kwargs): 250 """Read Mol2 multi molecular file to ChemDataFrame. UCSF Dock 6 comments 251 style is supported, i.e. `#### var_name: value` before molecular block. 252 253 .. versionadded:: 0.3 254 255 Parameters 256 ---------- 257 filepath_or_buffer : string or None 258 File path 259 260 usecols : list or None, optional (default=None) 261 A list of columns to read from file. If None then all available 262 fields are read. 263 264 molecule_column : string or None, optional (default='mol') 265 Name of molecule column. If None the molecules will be skipped and 266 the reading will be speed up significantly. 267 268 molecule_name_column : string or None, optional (default='mol_name') 269 Column name which will contain molecules' title/name. Column is 270 skipped when set to None. 271 272 smiles_column : string or None, optional (default=None) 273 Column name containg molecules' SMILES, by default it is disabled. 274 275 skip_bad_mols : bool, optional (default=False) 276 Switch to skip empty (bad) molecules. Useful for RDKit, which Returns 277 None if molecule can not sanitize. 278 279 chunksize : int or None, optional (default=None) 280 Size of chunk to return. If set to None whole set is returned. 281 282 Returns 283 ------- 284 result : 285 A `ChemDataFrame` containg all molecules if `chunksize` is None 286 or genrerator of `ChemDataFrame` with `chunksize` molecules. 287 288 """ 289 result = _mol_reader(fmt='mol2', 290 filepath_or_buffer=filepath_or_buffer, 291 usecols=usecols, 292 molecule_column=molecule_column, 293 molecule_name_column=molecule_name_column, 294 smiles_column=smiles_column, 295 skip_bad_mols=skip_bad_mols, 296 chunksize=chunksize, 297 **kwargs) 298 if chunksize: 299 return result 300 else: 301 return deque(result, maxlen=1).pop() 302 303 304class ChemSeries(pd.Series): 305 """Pandas Series modified to adapt `oddt.toolkit.Molecule` objects and apply 306 molecular methods easily. 307 308 .. versionadded:: 0.3 309 """ 310 def __le__(self, other): 311 """ Substructure searching. 312 `chemseries < mol`: are molecules in series substructures of a `mol` 313 """ 314 if (isinstance(other, oddt.toolkit.Molecule) and 315 isinstance(self[0], oddt.toolkit.Molecule)): 316 return self.map(lambda x: oddt.toolkit.Smarts(x.smiles).match(other)) 317 else: 318 return super(ChemSeries, self).__le__(other) 319 320 def __ge__(self, other): 321 """ Substructure searching. 322 `chemseries > mol`: is `mol` a substructure of molecules in series 323 """ 324 if (isinstance(other, oddt.toolkit.Molecule) and 325 isinstance(self[0], oddt.toolkit.Molecule)): 326 smarts = oddt.toolkit.Smarts(other.smiles) 327 return self.map(lambda x: smarts.match(x)) 328 else: 329 return super(ChemSeries, self).__ge__(other) 330 331 def __or__(self, other): 332 """ Tanimoto coefficient """ 333 if (isinstance(self[0], oddt.toolkit.Fingerprint) and 334 isinstance(other, oddt.toolkit.Fingerprint)): 335 return self.map(lambda x: x | other) 336 else: 337 return super(ChemSeries, self).__or__(other) 338 339 def calcfp(self, *args, **kwargs): 340 """Helper function to map FP calculation throuugh the series""" 341 assert(isinstance(self[0], oddt.toolkit.Molecule)) 342 return self.map(lambda x: x.calcfp(*args, **kwargs)) 343 344 def to_smiles(self, filepath_or_buffer=None): 345 return _mol_writer(self, fmt='smi', filepath_or_buffer=filepath_or_buffer) 346 347 def to_sdf(self, filepath_or_buffer=None): 348 return _mol_writer(self, fmt='sdf', filepath_or_buffer=filepath_or_buffer) 349 350 def to_mol2(self, filepath_or_buffer=None): 351 return _mol_writer(self, fmt='mol2', filepath_or_buffer=filepath_or_buffer) 352 353 @property 354 def _constructor(self): 355 """ Force new class to be usead as constructor """ 356 return ChemSeries 357 358 @property 359 def _constructor_expanddim(self): 360 """ Force new class to be usead as constructor when expandig dims """ 361 return ChemDataFrame 362 363 364class ChemDataFrame(pd.DataFrame): 365 """Chemical DataFrame object, which contains molecules column of 366 `oddt.toolkit.Molecule` objects. Rich display of moleucles (2D) is available 367 in iPython Notebook. Additional `to_sdf` and `to_mol2` methods make writing 368 to molecular formats easy. 369 370 .. versionadded:: 0.3 371 372 Notes 373 ----- 374 Thanks to: http://blog.snapdragon.cc/2015/05/05/subclass-pandas-dataframe-to-save-custom-attributes/ 375 """ 376 _metadata = ['_molecule_column'] 377 _molecule_column = None 378 379 def to_sdf(self, 380 filepath_or_buffer=None, 381 update_properties=True, 382 molecule_column=None, 383 columns=None): 384 """Write DataFrame to SDF file. 385 386 .. versionadded:: 0.3 387 388 Parameters 389 ---------- 390 filepath_or_buffer : string or None 391 File path 392 393 update_properties : bool, optional (default=True) 394 Switch to update properties from the DataFrames to the molecules 395 while writting. 396 397 molecule_column : string or None, optional (default='mol') 398 Name of molecule column. If None the molecules will be skipped. 399 400 columns : list or None, optional (default=None) 401 A list of columns to write to file. If None then all available 402 fields are written. 403 """ 404 molecule_column = molecule_column or self._molecule_column 405 return _mol_writer(self, 406 filepath_or_buffer=filepath_or_buffer, 407 update_properties=update_properties, 408 fmt='sdf', 409 molecule_column=molecule_column, 410 columns=columns) 411 412 def to_mol2(self, 413 filepath_or_buffer=None, 414 update_properties=True, 415 molecule_column='mol', 416 columns=None): 417 """Write DataFrame to Mol2 file. 418 419 .. versionadded:: 0.3 420 421 Parameters 422 ---------- 423 filepath_or_buffer : string or None 424 File path 425 426 update_properties : bool, optional (default=True) 427 Switch to update properties from the DataFrames to the molecules 428 while writting. 429 430 molecule_column : string or None, optional (default='mol') 431 Name of molecule column. If None the molecules will be skipped. 432 433 columns : list or None, optional (default=None) 434 A list of columns to write to file. If None then all available 435 fields are written. 436 """ 437 molecule_column = molecule_column or self._molecule_column 438 return _mol_writer(self, 439 fmt='mol2', 440 filepath_or_buffer=filepath_or_buffer, 441 update_properties=update_properties, 442 molecule_column=molecule_column, 443 columns=columns) 444 445 def to_html(self, *args, **kwargs): 446 """Patched rendering in HTML - don't escape HTML inside the cells. 447 Docs are copied from parent 448 """ 449 kwargs['escape'] = False 450 return super(ChemDataFrame, self).to_html(*args, **kwargs) 451 452 def to_csv(self, *args, **kwargs): 453 """ Docs are copied from parent """ 454 if self._molecule_column and ('columns' not in kwargs or 455 kwargs['columns'] is None or 456 self._molecule_column in kwargs['columns']): 457 frm_copy = self.copy(deep=True) 458 smi = frm_copy[self._molecule_column].map(lambda x: x.smiles) 459 frm_copy[self._molecule_column] = smi 460 return super(ChemDataFrame, frm_copy).to_csv(*args, **kwargs) 461 else: 462 return super(ChemDataFrame, self).to_csv(*args, **kwargs) 463 464 def to_excel(self, *args, **kwargs): 465 """ Docs are copied from parent """ 466 467 if 'columns' in kwargs: 468 columns = kwargs['columns'] 469 else: 470 columns = self.columns.tolist() 471 472 if 'molecule_column' in kwargs: 473 molecule_column = kwargs['molecule_column'] 474 else: 475 molecule_column = self._molecule_column 476 477 molecule_column_idx = columns.index(molecule_column) 478 if 'index' not in kwargs or ('index' in kwargs and kwargs['index']): 479 molecule_column_idx += 1 480 size = kwargs.pop('size') if 'size' in kwargs else (200, 200) 481 excel_writer = args[0] 482 if isinstance(excel_writer, str): 483 excel_writer = pd.ExcelWriter(excel_writer, engine='xlsxwriter') 484 assert excel_writer.engine == 'xlsxwriter' 485 486 frm_copy = self.copy(deep=True) 487 smi = frm_copy[molecule_column].map(lambda x: x.smiles) 488 frm_copy[molecule_column] = smi 489 490 super(ChemDataFrame, frm_copy).to_excel(excel_writer, *args[1:], **kwargs) 491 492 sheet = excel_writer.sheets['Sheet1'] # TODO: Get appropriate sheet name 493 sheet.set_column(molecule_column_idx, molecule_column_idx, 494 width=size[1] / 6.) 495 for i, mol in enumerate(self[molecule_column]): 496 if mol is None: 497 continue 498 img = BytesIO() 499 png = mol.clone.write('png', size=size) 500 if isinstance(png, text_type): 501 png = png.encode('utf-8', errors='surrogateescape') 502 img.write(png) 503 sheet.write_string(i + 1, molecule_column_idx, "") 504 sheet.insert_image(i + 1, 505 molecule_column_idx, 506 'dummy', 507 {'image_data': img, 508 'positioning': 2, 509 'x_offset': 1, 510 'y_offset': 1}) 511 sheet.set_row(i + 1, height=size[0]) 512 excel_writer.save() 513 514 @property 515 def _constructor(self): 516 """ Force new class to be usead as constructor """ 517 return ChemDataFrame 518 519 @property 520 def _constructor_sliced(self): 521 """ Force new class to be usead as constructor when slicing """ 522 return ChemSeries 523 524 @property 525 def _constructor_expanddim(self): 526 """ Force new class to be usead as constructor when expandig dims """ 527 return ChemPanel 528 529 530# Copy some docscrings from upstream classes 531for method in ['to_html', 'to_csv', 'to_excel']: 532 try: 533 getattr(ChemDataFrame, method).__doc__ = getattr(pd.DataFrame, method).__doc__ 534 except AttributeError: # Python 2 compatible 535 getattr(ChemDataFrame, method).__func__.__doc__ = getattr(pd.DataFrame, method).__func__.__doc__ 536 537 538class ChemPanel(pd.Panel): 539 """Modified `pandas.Panel` to adopt higher dimension data than 540 `ChemDataFrame`. Main purpose is to store molecular fingerprints in one 541 column and keep 2D numpy array underneath. 542 543 .. versionadded:: 0.3 544 """ 545 _metadata = ['_molecule_column'] 546 _molecule_column = None 547 548 @property 549 def _constructor(self): 550 """ Force new class to be usead as constructor """ 551 return ChemPanel 552 553 @property 554 def _constructor_sliced(self): 555 """ Force new class to be usead as constructor when slicing """ 556 return ChemDataFrame 557