1""" Pandas extension for chemical analysis """
2from __future__ import absolute_import
3from collections import deque
4from six import BytesIO, StringIO, text_type
5import pandas as pd
6
7import oddt
8
9pd.set_option("display.max_colwidth", 999999)
10
11
12def _mol_reader(fmt='sdf',
13                filepath_or_buffer=None,
14                usecols=None,
15                molecule_column='mol',
16                molecule_name_column='mol_name',
17                smiles_column=None,
18                skip_bad_mols=False,
19                chunksize=None,
20                **kwargs):
21    """Universal reading function for private use.
22
23    .. versionadded:: 0.3
24
25    Parameters
26    ----------
27    fmt : string
28        The format of molecular file
29
30    filepath_or_buffer : string or None
31        File path
32
33    usecols : list or None, optional (default=None)
34        A list of columns to read from file. If None then all available
35        fields are read.
36
37    molecule_column : string or None, optional (default='mol')
38        Name of molecule column. If None the molecules will be skipped and
39        the reading will be speed up significantly.
40
41    molecule_name_column : string or None, optional (default='mol_name')
42        Column name which will contain molecules' title/name. Column is
43        skipped when set to None.
44
45    smiles_column  : string or None, optional (default=None)
46        Column name containg molecules' SMILES, by default it is disabled.
47
48    skip_bad_mols : bool, optional (default=False)
49        Switch to skip empty (bad) molecules. Useful for RDKit, which Returns
50        None if molecule can not sanitize.
51
52    chunksize : int or None, optional (default=None)
53        Size of chunk to return. If set to None whole set is returned.
54
55    Returns
56    -------
57    chunk :
58        A `ChemDataFrame` containg `chunksize` molecules.
59
60    """
61    # capture options for reader
62    reader_kwargs = {}
63    if 'opt' in kwargs:
64        reader_kwargs['opt'] = kwargs.pop('opt')
65    if 'sanitize' in kwargs:
66        reader_kwargs['sanitize'] = kwargs.pop('sanitize')
67
68    # when you dont read molecules you can skip parsing them
69    if molecule_column is None:
70        if oddt.toolkit.backend == 'ob' and fmt == 'sdf':
71            if 'opt' in reader_kwargs:
72                reader_kwargs['opt']['P'] = None
73            else:
74                reader_kwargs['opt'] = {'P': None}
75        elif oddt.toolkit.backend == 'rdk':
76            reader_kwargs['sanitize'] = False
77
78    chunk = []
79    for n, mol in enumerate(oddt.toolkit.readfile(fmt, filepath_or_buffer,
80                                                  **reader_kwargs)):
81        if skip_bad_mols and mol is None:
82            continue  # add warning with number of skipped molecules
83        if usecols is None:
84            mol_data = mol.data.to_dict()
85        else:
86            mol_data = dict((k, mol.data[k]) for k in usecols)
87
88        if molecule_column:
89            mol_data[molecule_column] = mol
90        if molecule_name_column:
91            mol_data[molecule_name_column] = mol.title
92        if smiles_column:
93            mol_data[smiles_column] = mol.smiles
94        chunk.append(mol_data)
95        if chunksize and (n + 1) % chunksize == 0:
96            chunk_frm = ChemDataFrame(chunk, **kwargs)
97            chunk_frm._molecule_column = molecule_column
98            yield chunk_frm
99            chunk = []
100    if chunk or chunksize is None:
101        chunk_frm = ChemDataFrame(chunk, **kwargs)
102        chunk_frm._molecule_column = molecule_column
103        yield chunk_frm
104
105
106def _mol_writer(data,
107                fmt='sdf',
108                filepath_or_buffer=None,
109                update_properties=True,
110                molecule_column=None,
111                columns=None):
112    """Universal writing function for private use.
113
114    .. versionadded:: 0.3
115
116    Parameters
117    ----------
118    fmt : string
119        The format of molecular file
120
121    filepath_or_buffer : string or None
122        File path
123
124    update_properties : bool, optional (default=True)
125        Switch to update properties from the DataFrames to the molecules
126        while writting.
127
128    molecule_column : string or None, optional (default='mol')
129        Name of molecule column. If None the molecules will be skipped.
130
131    columns : list or None, optional (default=None)
132        A list of columns to write to file. If None then all available
133        fields are written.
134
135    """
136    if filepath_or_buffer is None:
137        out = StringIO()
138    elif hasattr(filepath_or_buffer, 'write'):
139        out = filepath_or_buffer
140    else:
141        out = oddt.toolkit.Outputfile(fmt, filepath_or_buffer, overwrite=True)
142    if isinstance(data, pd.DataFrame):
143        molecule_column = molecule_column or data._molecule_column
144        for ix, row in data.iterrows():
145            mol = row[molecule_column].clone
146            if update_properties:
147                new_data = row.to_dict()
148                del new_data[molecule_column]
149                mol.data.update(new_data)
150            if columns:
151                for k in mol.data.keys():
152                    if k not in columns:
153                        del mol.data[k]
154            if filepath_or_buffer is None or hasattr(filepath_or_buffer, 'write'):
155                out.write(mol.write(fmt))
156            else:
157                out.write(mol)
158    elif isinstance(data, pd.Series):
159        for mol in data:
160            if filepath_or_buffer is None or hasattr(filepath_or_buffer, 'write'):
161                out.write(mol.write(fmt))
162            else:
163                out.write(mol)
164    if filepath_or_buffer is None:
165        return out.getvalue()
166    elif not hasattr(filepath_or_buffer, 'write'):  # dont close foreign buffer
167        out.close()
168
169
170def read_csv(*args, **kwargs):
171    """ TODO: Support Chunks """
172    smiles_to_molecule = kwargs.pop('smiles_to_molecule', None)
173    molecule_column = kwargs.pop('molecule_column', 'mol')
174    data = pd.read_csv(*args, **kwargs)
175    if smiles_to_molecule is not None:
176        data[molecule_column] = data[smiles_to_molecule].map(
177            lambda x: oddt.toolkit.readstring('smi', x))
178    return data
179
180
181def read_sdf(filepath_or_buffer=None,
182             usecols=None,
183             molecule_column='mol',
184             molecule_name_column='mol_name',
185             smiles_column=None,
186             skip_bad_mols=False,
187             chunksize=None,
188             **kwargs):
189    """Read SDF/MDL multi molecular file to ChemDataFrame
190
191    .. versionadded:: 0.3
192
193    Parameters
194    ----------
195    filepath_or_buffer : string or None
196        File path
197
198    usecols : list or None, optional (default=None)
199        A list of columns to read from file. If None then all available
200        fields are read.
201
202    molecule_column : string or None, optional (default='mol')
203        Name of molecule column. If None the molecules will be skipped and
204        the reading will be speed up significantly.
205
206    molecule_name_column : string or None, optional (default='mol_name')
207        Column name which will contain molecules' title/name. Column is
208        skipped when set to None.
209
210    smiles_column  : string or None, optional (default=None)
211        Column name containg molecules' SMILES, by default it is disabled.
212
213    skip_bad_mols : bool, optional (default=False)
214        Switch to skip empty (bad) molecules. Useful for RDKit, which Returns
215        None if molecule can not sanitize.
216
217    chunksize : int or None, optional (default=None)
218        Size of chunk to return. If set to None whole set is returned.
219
220    Returns
221    -------
222    result :
223        A `ChemDataFrame` containg all molecules if `chunksize` is None
224        or genrerator of `ChemDataFrame` with `chunksize` molecules.
225
226    """
227    result = _mol_reader(fmt='sdf',
228                         filepath_or_buffer=filepath_or_buffer,
229                         usecols=usecols,
230                         molecule_column=molecule_column,
231                         molecule_name_column=molecule_name_column,
232                         smiles_column=smiles_column,
233                         skip_bad_mols=skip_bad_mols,
234                         chunksize=chunksize,
235                         **kwargs)
236    if chunksize:
237        return result
238    else:
239        return deque(result, maxlen=1).pop()
240
241
242def read_mol2(filepath_or_buffer=None,
243              usecols=None,
244              molecule_column='mol',
245              molecule_name_column='mol_name',
246              smiles_column=None,
247              skip_bad_mols=False,
248              chunksize=None,
249              **kwargs):
250    """Read Mol2 multi molecular file to ChemDataFrame. UCSF Dock 6 comments
251    style is supported, i.e. `#### var_name: value` before molecular block.
252
253    .. versionadded:: 0.3
254
255    Parameters
256    ----------
257    filepath_or_buffer : string or None
258        File path
259
260    usecols : list or None, optional (default=None)
261        A list of columns to read from file. If None then all available
262        fields are read.
263
264    molecule_column : string or None, optional (default='mol')
265        Name of molecule column. If None the molecules will be skipped and
266        the reading will be speed up significantly.
267
268    molecule_name_column : string or None, optional (default='mol_name')
269        Column name which will contain molecules' title/name. Column is
270        skipped when set to None.
271
272    smiles_column  : string or None, optional (default=None)
273        Column name containg molecules' SMILES, by default it is disabled.
274
275    skip_bad_mols : bool, optional (default=False)
276        Switch to skip empty (bad) molecules. Useful for RDKit, which Returns
277        None if molecule can not sanitize.
278
279    chunksize : int or None, optional (default=None)
280        Size of chunk to return. If set to None whole set is returned.
281
282    Returns
283    -------
284    result :
285        A `ChemDataFrame` containg all molecules if `chunksize` is None
286        or genrerator of `ChemDataFrame` with `chunksize` molecules.
287
288    """
289    result = _mol_reader(fmt='mol2',
290                         filepath_or_buffer=filepath_or_buffer,
291                         usecols=usecols,
292                         molecule_column=molecule_column,
293                         molecule_name_column=molecule_name_column,
294                         smiles_column=smiles_column,
295                         skip_bad_mols=skip_bad_mols,
296                         chunksize=chunksize,
297                         **kwargs)
298    if chunksize:
299        return result
300    else:
301        return deque(result, maxlen=1).pop()
302
303
304class ChemSeries(pd.Series):
305    """Pandas Series modified to adapt `oddt.toolkit.Molecule` objects and apply
306    molecular methods easily.
307
308    .. versionadded:: 0.3
309    """
310    def __le__(self, other):
311        """ Substructure searching.
312        `chemseries < mol`: are molecules in series substructures of a `mol`
313        """
314        if (isinstance(other, oddt.toolkit.Molecule) and
315           isinstance(self[0], oddt.toolkit.Molecule)):
316            return self.map(lambda x: oddt.toolkit.Smarts(x.smiles).match(other))
317        else:
318            return super(ChemSeries, self).__le__(other)
319
320    def __ge__(self, other):
321        """ Substructure searching.
322        `chemseries > mol`: is `mol` a substructure of molecules in series
323        """
324        if (isinstance(other, oddt.toolkit.Molecule) and
325           isinstance(self[0], oddt.toolkit.Molecule)):
326            smarts = oddt.toolkit.Smarts(other.smiles)
327            return self.map(lambda x: smarts.match(x))
328        else:
329            return super(ChemSeries, self).__ge__(other)
330
331    def __or__(self, other):
332        """ Tanimoto coefficient """
333        if (isinstance(self[0], oddt.toolkit.Fingerprint) and
334           isinstance(other, oddt.toolkit.Fingerprint)):
335            return self.map(lambda x: x | other)
336        else:
337            return super(ChemSeries, self).__or__(other)
338
339    def calcfp(self, *args, **kwargs):
340        """Helper function to map FP calculation throuugh the series"""
341        assert(isinstance(self[0], oddt.toolkit.Molecule))
342        return self.map(lambda x: x.calcfp(*args, **kwargs))
343
344    def to_smiles(self, filepath_or_buffer=None):
345        return _mol_writer(self, fmt='smi', filepath_or_buffer=filepath_or_buffer)
346
347    def to_sdf(self, filepath_or_buffer=None):
348        return _mol_writer(self, fmt='sdf', filepath_or_buffer=filepath_or_buffer)
349
350    def to_mol2(self, filepath_or_buffer=None):
351        return _mol_writer(self, fmt='mol2', filepath_or_buffer=filepath_or_buffer)
352
353    @property
354    def _constructor(self):
355        """ Force new class to be usead as constructor """
356        return ChemSeries
357
358    @property
359    def _constructor_expanddim(self):
360        """ Force new class to be usead as constructor when expandig dims """
361        return ChemDataFrame
362
363
364class ChemDataFrame(pd.DataFrame):
365    """Chemical DataFrame object, which contains molecules column of
366    `oddt.toolkit.Molecule` objects. Rich display of moleucles (2D) is available
367    in iPython Notebook. Additional `to_sdf` and `to_mol2` methods make writing
368    to molecular formats easy.
369
370    .. versionadded:: 0.3
371
372    Notes
373    -----
374    Thanks to: http://blog.snapdragon.cc/2015/05/05/subclass-pandas-dataframe-to-save-custom-attributes/
375    """
376    _metadata = ['_molecule_column']
377    _molecule_column = None
378
379    def to_sdf(self,
380               filepath_or_buffer=None,
381               update_properties=True,
382               molecule_column=None,
383               columns=None):
384        """Write DataFrame to SDF file.
385
386        .. versionadded:: 0.3
387
388        Parameters
389        ----------
390        filepath_or_buffer : string or None
391            File path
392
393        update_properties : bool, optional (default=True)
394            Switch to update properties from the DataFrames to the molecules
395            while writting.
396
397        molecule_column : string or None, optional (default='mol')
398            Name of molecule column. If None the molecules will be skipped.
399
400        columns : list or None, optional (default=None)
401            A list of columns to write to file. If None then all available
402            fields are written.
403        """
404        molecule_column = molecule_column or self._molecule_column
405        return _mol_writer(self,
406                           filepath_or_buffer=filepath_or_buffer,
407                           update_properties=update_properties,
408                           fmt='sdf',
409                           molecule_column=molecule_column,
410                           columns=columns)
411
412    def to_mol2(self,
413                filepath_or_buffer=None,
414                update_properties=True,
415                molecule_column='mol',
416                columns=None):
417        """Write DataFrame to Mol2 file.
418
419        .. versionadded:: 0.3
420
421        Parameters
422        ----------
423        filepath_or_buffer : string or None
424            File path
425
426        update_properties : bool, optional (default=True)
427            Switch to update properties from the DataFrames to the molecules
428            while writting.
429
430        molecule_column : string or None, optional (default='mol')
431            Name of molecule column. If None the molecules will be skipped.
432
433        columns : list or None, optional (default=None)
434            A list of columns to write to file. If None then all available
435            fields are written.
436        """
437        molecule_column = molecule_column or self._molecule_column
438        return _mol_writer(self,
439                           fmt='mol2',
440                           filepath_or_buffer=filepath_or_buffer,
441                           update_properties=update_properties,
442                           molecule_column=molecule_column,
443                           columns=columns)
444
445    def to_html(self, *args, **kwargs):
446        """Patched rendering in HTML - don't escape HTML inside the cells.
447        Docs are copied from parent
448        """
449        kwargs['escape'] = False
450        return super(ChemDataFrame, self).to_html(*args, **kwargs)
451
452    def to_csv(self, *args, **kwargs):
453        """ Docs are copied from parent """
454        if self._molecule_column and ('columns' not in kwargs or
455                                      kwargs['columns'] is None or
456                                      self._molecule_column in kwargs['columns']):
457            frm_copy = self.copy(deep=True)
458            smi = frm_copy[self._molecule_column].map(lambda x: x.smiles)
459            frm_copy[self._molecule_column] = smi
460            return super(ChemDataFrame, frm_copy).to_csv(*args, **kwargs)
461        else:
462            return super(ChemDataFrame, self).to_csv(*args, **kwargs)
463
464    def to_excel(self, *args, **kwargs):
465        """ Docs are copied from parent """
466
467        if 'columns' in kwargs:
468            columns = kwargs['columns']
469        else:
470            columns = self.columns.tolist()
471
472        if 'molecule_column' in kwargs:
473            molecule_column = kwargs['molecule_column']
474        else:
475            molecule_column = self._molecule_column
476
477        molecule_column_idx = columns.index(molecule_column)
478        if 'index' not in kwargs or ('index' in kwargs and kwargs['index']):
479            molecule_column_idx += 1
480        size = kwargs.pop('size') if 'size' in kwargs else (200, 200)
481        excel_writer = args[0]
482        if isinstance(excel_writer, str):
483            excel_writer = pd.ExcelWriter(excel_writer, engine='xlsxwriter')
484        assert excel_writer.engine == 'xlsxwriter'
485
486        frm_copy = self.copy(deep=True)
487        smi = frm_copy[molecule_column].map(lambda x: x.smiles)
488        frm_copy[molecule_column] = smi
489
490        super(ChemDataFrame, frm_copy).to_excel(excel_writer, *args[1:], **kwargs)
491
492        sheet = excel_writer.sheets['Sheet1']  # TODO: Get appropriate sheet name
493        sheet.set_column(molecule_column_idx, molecule_column_idx,
494                         width=size[1] / 6.)
495        for i, mol in enumerate(self[molecule_column]):
496            if mol is None:
497                continue
498            img = BytesIO()
499            png = mol.clone.write('png', size=size)
500            if isinstance(png, text_type):
501                png = png.encode('utf-8', errors='surrogateescape')
502            img.write(png)
503            sheet.write_string(i + 1, molecule_column_idx, "")
504            sheet.insert_image(i + 1,
505                               molecule_column_idx,
506                               'dummy',
507                               {'image_data': img,
508                                'positioning': 2,
509                                'x_offset': 1,
510                                'y_offset': 1})
511            sheet.set_row(i + 1, height=size[0])
512        excel_writer.save()
513
514    @property
515    def _constructor(self):
516        """ Force new class to be usead as constructor """
517        return ChemDataFrame
518
519    @property
520    def _constructor_sliced(self):
521        """ Force new class to be usead as constructor when slicing """
522        return ChemSeries
523
524    @property
525    def _constructor_expanddim(self):
526        """ Force new class to be usead as constructor when expandig dims """
527        return ChemPanel
528
529
530# Copy some docscrings from upstream classes
531for method in ['to_html', 'to_csv', 'to_excel']:
532    try:
533        getattr(ChemDataFrame, method).__doc__ = getattr(pd.DataFrame, method).__doc__
534    except AttributeError:  # Python 2 compatible
535        getattr(ChemDataFrame, method).__func__.__doc__ = getattr(pd.DataFrame, method).__func__.__doc__
536
537
538class ChemPanel(pd.Panel):
539    """Modified `pandas.Panel` to adopt higher dimension data than
540    `ChemDataFrame`. Main purpose is to store molecular fingerprints in one
541    column and keep 2D numpy array underneath.
542
543    .. versionadded:: 0.3
544    """
545    _metadata = ['_molecule_column']
546    _molecule_column = None
547
548    @property
549    def _constructor(self):
550        """ Force new class to be usead as constructor """
551        return ChemPanel
552
553    @property
554    def _constructor_sliced(self):
555        """ Force new class to be usead as constructor when slicing """
556        return ChemDataFrame
557