1"""
2idxml - idXML file reader
3=========================
4
5Summary
6-------
7
8**idXML** is a format specified in the
9`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
10It defines a list of peptide identifications.
11
12This module provides a minimalistic way to extract information from idXML
13files. You can use the old functional interface (:py:func:`read`) or the new
14object-oriented interface (:py:class:`IDXML`) to iterate over entries in
15``<PeptideIdentification>`` elements. Note that each entry can contain more than one PSM
16(peptide-spectrum match). They are accessible with ``'PeptideHit'`` key.
17:py:class:`IDXML` objects also support direct indexing by element ID.
18
19Data access
20-----------
21
22  :py:class:`IDXML` - a class representing a single idXML file.
23  Other data access functions use this class internally.
24
25  :py:func:`read` - iterate through peptide-spectrum matches in an idXML
26  file. Data from a single PSM group are converted to a human-readable dict.
27  Basically creates an :py:class:`IDXML` object and reads it.
28
29  :py:func:`chain` - read multiple files at once.
30
31  :py:func:`chain.from_iterable` - read multiple files at once, using an
32  iterable of files.
33
34  :py:func:`DataFrame` - read idXML files into a :py:class:`pandas.DataFrame`.
35
36Target-decoy approach
37---------------------
38
39  :py:func:`filter` - read a chain of idXML files and filter to a certain
40  FDR using TDA.
41
42  :py:func:`filter.chain` - chain a series of filters applied independently to
43  several files.
44
45  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
46  independently to an iterable of files.
47
48  :py:func:`filter_df` - filter idXML files and return a :py:class:`pandas.DataFrame`.
49
  :py:func:`is_decoy` - determine if a "PeptideIdentification" entry should be
  considered decoy.
52
53  :py:func:`fdr` - estimate the false discovery rate of a set of identifications
54  using the target-decoy approach.
55
56  :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
57  set using the target-decoy approach.
58
59Deprecated functions
60--------------------
61
62  :py:func:`version_info` - get information about idXML version and schema.
63  You can just read the corresponding attribute of the :py:class:`IDXML`
64  object.
65
66  :py:func:`get_by_id` - get an element by its ID and extract the data from it.
67  You can just call the corresponding method of the :py:class:`IDXML`
68  object.
69
70  :py:func:`iterfind` - iterate over elements in an idXML file.
71  You can just call the corresponding method of the :py:class:`IDXML`
72  object.
73
74Dependencies
75------------
76
77This module requires :py:mod:`lxml`.
78
79-------------------------------------------------------------------------------
80"""
81
82#   Copyright 2020 Lev Levitsky
83#
84#   Licensed under the Apache License, Version 2.0 (the "License");
85#   you may not use this file except in compliance with the License.
86#   You may obtain a copy of the License at
87#
88#     http://www.apache.org/licenses/LICENSE-2.0
89#
90#   Unless required by applicable law or agreed to in writing, software
91#   distributed under the License is distributed on an "AS IS" BASIS,
92#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
93#   See the License for the specific language governing permissions and
94#   limitations under the License.
95
96
97import warnings
98from .. import auxiliary as aux
99from .. import xml, _schema_defaults
100
101
class IDXML(xml.IndexedXML):
    """Reader for idXML files, built on :py:class:`xml.IndexedXML`.

    Unless the caller passes ``retrieve_refs`` explicitly, it is set to
    :py:const:`True` before the arguments are handed to the base class.
    """
    file_format = 'idXML'
    _root_element = 'IdXML'
    _default_schema = _schema_defaults._idxml_schema_defaults
    _default_version = '1.5'
    _default_iter_tag = 'PeptideIdentification'
    _structures_to_flatten = {}
    _indexed_tags = {'ProteinHit'}
    _schema_location_param = 'noNamespaceSchemaLocation'

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class, enabling ``retrieve_refs``
        when the caller did not specify it."""
        if 'retrieve_refs' not in kwargs:
            kwargs['retrieve_refs'] = True
        super(IDXML, self).__init__(*args, **kwargs)

    def _get_info_smart(self, element, **kwargs):
        """Extract info from `element`, choosing recursion by element type.

        When `recursive` is not given (or is :py:const:`None`), the root
        element is unpacked non-recursively and any other element recursively.
        After extraction, ``start``/``end`` values that are two-item lists are
        replaced by the list of ints parsed from their first item, and
        ``aa_before``/``aa_after`` values are split on whitespace.
        """
        options = dict(kwargs)
        requested = options.pop("recursive", None)
        is_root = xml._local_name(element) == self._root_element

        if requested is None:
            # Do not unpack the whole document through its root element
            # unless the caller explicitly asks for it.
            recursive = not is_root
        else:
            recursive = requested
        info = self._get_info(element, recursive=recursive, **options)

        for key in ('start', 'end'):
            value = info.get(key)
            if isinstance(value, list) and len(value) == 2:
                info[key] = [int(token) for token in value[0].split()]
        for key in ('aa_before', 'aa_after'):
            if key in info:
                info[key] = info[key].split()
        return info

    def _retrieve_refs(self, info, **kwargs):
        """Resolve every attribute of `info` whose name ends in ``_refs``.

        The value is split on whitespace and each ID is looked up with
        :py:meth:`get_by_id`. On success the ``X_refs`` key is replaced by an
        ``X`` key holding the list of resolved dicts (with their ``id`` removed).
        If any lookup raises :py:exc:`KeyError`, a warning is issued and the
        attribute is left untouched. `info` is modified in place.
        """
        suffix = '_refs'
        # Iterate over a snapshot, because `info` is modified inside the loop.
        for key, value in list(info.items()):
            if key[-5:] != suffix:
                continue
            try:
                resolved = [self.get_by_id(ref, retrieve_refs=True) for ref in value.split()]
            except KeyError:
                warnings.warn('Ignoring unresolved reference: ' + value)
                continue
            for entry in resolved:
                entry.pop('id', None)
            info[key[:-5]] = resolved
            del info[key]
152
153
def read(source, **kwargs):
    """Open `source` as an idXML file and return a reader over its entries.

    .. note:: This function is provided for backward compatibility only.
        It simply creates an :py:class:`IDXML` instance using
        provided arguments and returns it.

    Parameters
    ----------
    source : str or file
        A path to a target idXML file or the file object itself.

    recursive : bool, optional
        If :py:const:`False`, subelements will not be processed when
        extracting info from elements. Default is :py:const:`True`.

    retrieve_refs : bool, optional
        If :py:const:`True`, additional information from references will be
        automatically added to the results. The file processing time will
        increase. Default is :py:const:`True`.

    iterative : bool, optional
        Specifies whether iterative XML parsing should be used. Iterative
        parsing significantly reduces memory usage and may be just a little
        slower. When `retrieve_refs` is :py:const:`True`, however, it is
        highly recommended to disable iterative parsing if possible.
        Default value is :py:const:`True`.

    read_schema : bool, optional
        If :py:const:`True`, attempt to extract information from the XML schema
        mentioned in the idXML header (default). Otherwise, use default
        parameters. Disable this to avoid waiting on slow network connections or
        if you don't like to get the related warnings.

    build_id_cache : bool, optional
        Defines whether a cache of element IDs should be built and stored on the
        created :py:class:`IDXML` instance. Default value is the value of
        `retrieve_refs`.

        .. note:: This parameter is ignored when ``use_index`` is ``True`` (default).

    use_index : bool, optional
        Defines whether an index of byte offsets needs to be created for
        the indexed elements. If :py:const:`True` (default), `build_id_cache` is ignored.

    indexed_tags : container of bytes, optional
        Defines which elements need to be indexed.
        NOTE(review): the class-level default on :py:class:`IDXML` is
        ``{'ProteinHit'}``; confirm how this argument interacts with it.

    Returns
    -------
    out : IDXML
       An iterator over the dicts with PSM properties.
    """
    options = dict(kwargs)
    # `build_id_cache` follows the effective `retrieve_refs` unless given explicitly.
    retrieve_refs = options.setdefault('retrieve_refs', True)
    options.setdefault('build_id_cache', retrieve_refs)
    return IDXML(source, **options)
211
212
def iterfind(source, path, **kwargs):
    """Open `source` and iterate over info on the elements matching `path`
    (a local element name or an "XPath"-like expression).

    .. note:: This function is provided for backward compatibility only.
        If you do multiple :py:func:`iterfind` calls on one file, you should
        create an :py:class:`IDXML` object and use its
        :py:meth:`!iterfind` method.

    Parameters
    ----------
    source : str or file
        File name or file-like object.

    path : str
        Element name or XPath-like expression. Only local names separated
        with slashes are accepted. An asterisk (`*`) means any element.
        You can specify a single condition in the end, such as:
        ``"/path/to/element[some_value>1.5]"``
        Note: you can do much more powerful filtering using plain Python.
        The path can be absolute or "free". Please don't specify
        namespaces.

    recursive : bool, optional
        If :py:const:`False`, subelements will not be processed when
        extracting info from elements. Default is :py:const:`True`.

    retrieve_refs : bool, optional
        If :py:const:`True`, additional information from references will be
        automatically added to the results. The file processing time will
        increase. This function does not set a default itself; note that the
        :py:class:`IDXML` constructor defaults it to :py:const:`True` when it
        is not supplied.

    iterative : bool, optional
        Specifies whether iterative XML parsing should be used. Iterative
        parsing significantly reduces memory usage and may be just a little
        slower. When `retrieve_refs` is :py:const:`True`, however, it is
        highly recommended to disable iterative parsing if possible.
        Default value is :py:const:`True`.

    read_schema : bool, optional
        If :py:const:`True`, attempt to extract information from the XML schema
        mentioned in the idXML header (default). Otherwise, use default
        parameters. Disable this to avoid waiting on slow network connections or
        if you don't like to get the related warnings.

    build_id_cache : bool, optional
        Defines whether a cache of element IDs should be built and stored on the
        created :py:class:`IDXML` instance. Default value is the value of
        `retrieve_refs` as passed to this function (:py:const:`None` if absent).

    Returns
    -------
    out : iterator
    """
    options = dict(kwargs)
    options.setdefault('build_id_cache', options.get('retrieve_refs'))
    # The same keyword arguments go to both the constructor and the search.
    reader = IDXML(source, **options)
    return reader.iterfind(path, **options)
270
271
# Deprecated functional helper (listed under "Deprecated functions" in the
# module docstring), generated for the IDXML class by the factory in `xml`.
version_info = xml._make_version_info(IDXML)
273
274
def get_by_id(source, elem_id, **kwargs):
    """Open `source` and look up the element whose `id` attribute equals
    `elem_id`, returning the data extracted from it.

    .. note:: This function is provided for backward compatibility only.
        If you do multiple :py:func:`get_by_id` calls on one file, you should
        create an :py:class:`IDXML` object and use its
        :py:meth:`!get_by_id` method.

    Parameters
    ----------
    source : str or file
        A path to a target idXML file or the file object itself.

    elem_id : str
        The value of the `id` attribute to match.

    **kwargs
        Passed both to the :py:class:`IDXML` constructor and to its
        :py:meth:`!get_by_id` method.

    Returns
    -------
    out : :py:class:`dict` or :py:const:`None`
        NOTE(review): the behavior for an unknown ID is decided by
        :py:meth:`!IDXML.get_by_id`; this module's reference resolution
        catches :py:exc:`KeyError` from it, so a missing ID may raise rather
        than return :py:const:`None` - confirm.
    """
    reader = IDXML(source, **kwargs)
    return reader.get_by_id(elem_id, **kwargs)
297
298
# `chain` reads multiple idXML files at once (see "Data access" in the module
# docstring); it is generated around the IDXML class by the `auxiliary` factory.
chain = aux.ChainBase._make_chain(IDXML)
300
301
def is_decoy(psm, prefix=None):
    """Tell whether a PSM dict is marked as decoy.

    Only the first ``'PeptideHit'`` of the entry is inspected: the result is
    :py:const:`True` exactly when its ``'target_decoy'`` value equals
    ``'decoy'``.

    Parameters
    ----------
    psm : dict
        A dict, as yielded by :py:func:`read`.
    prefix : ignored

    Returns
    -------
    out : bool
    """
    top_hit = psm['PeptideHit'][0]
    label = top_hit['target_decoy']
    return label == 'decoy'
317
318
def _psm_to_row(item, sep=None):
    """Flatten one entry (as yielded by the idXML reader) into a flat dict.

    The row is assembled in this order, later values overwriting earlier ones:

    1. scalar (``str``/``int``/``float``) fields of the entry itself;
    2. scalar fields of the first ``'PeptideHit'``, if there is one;
    3. if that hit has a non-empty ``'protein'`` list: ``str``/``int``/
       ``float``/``list`` fields of the first protein, followed by the
       per-protein columns ``accession``, ``is decoy``, ``start``, ``end``,
       ``aa_before`` and ``aa_after``.

    NOTE(review): because of step 3, a field such as ``'score'`` on the first
    protein replaces the same-named PSM field - confirm this is intended.
    """
    scalar_types = (str, int, float)
    row = {k: v for k, v in item.items() if isinstance(v, scalar_types)}

    hits = item.get('PeptideHit')
    # A missing key and an empty list both mean "no PSM for this entry":
    # keep the entry-level fields only instead of failing on `hits[0]`.
    if not hits:
        return row
    peptide_hit = hits[0]
    if peptide_hit is None:
        return row

    row.update((k, v) for k, v in peptide_hit.items() if isinstance(v, scalar_types))
    proteins = peptide_hit.get('protein')
    if not proteins:
        return row

    accessions, decoy_labels, starts, ends, aa_bs, aa_as = [], [], [], [], [], []
    # Positions and flanking residues are stored on the hit as parallel lists,
    # one item per referenced protein; zip() stops at the shortest of them.
    for prot, start, end, aab, aaa in zip(proteins, peptide_hit['start'], peptide_hit['end'],
                                          peptide_hit['aa_before'], peptide_hit['aa_after']):
        accessions.append(prot.get('accession'))
        decoy_labels.append(prot.get('target_decoy'))
        starts.append(start)
        ends.append(end)
        aa_bs.append(aab)
        aa_as.append(aaa)

    # A PSM counts as decoy only if every matched protein is a decoy.
    all_decoy = all(label == 'decoy' for label in decoy_labels)
    if sep is not None:
        # Join only homogeneous string lists; anything else stays a list.
        if all(isinstance(acc, str) for acc in accessions):
            accessions = sep.join(accessions)
        if all(isinstance(aaa, str) for aaa in aa_as):
            aa_as = sep.join(aa_as)
        if all(isinstance(aab, str) for aab in aa_bs):
            aa_bs = sep.join(aa_bs)
    if all(acc is None for acc in accessions):
        accessions = None

    row.update((k, v) for k, v in proteins[0].items() if isinstance(v, (str, int, float, list)))
    row['accession'] = accessions
    row['is decoy'] = all_decoy
    row['start'] = starts
    row['end'] = ends
    row['aa_before'] = aa_bs
    row['aa_after'] = aa_as
    return row


def DataFrame(*args, **kwargs):
    """Read idXML files into a :py:class:`pandas.DataFrame`.

    Requires :py:mod:`pandas`.

    .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'.

    Entries without any 'PeptideHit' (key missing or list empty) produce a row
    holding only the entry-level scalar fields.

    Parameters
    ----------
    *args
        Passed to :py:func:`chain`

    **kwargs
        Passed to :py:func:`chain`

    sep : str or None, keyword only, optional
        Some values related to PSMs (such as protein information) are variable-length
        lists. If `sep` is a :py:class:`str`, they will be packed into single string using
        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
        :py:const:`None`.

    Returns
    -------
    out : pandas.DataFrame
    """
    import pandas as pd

    # `sep` is consumed here; everything else is forwarded to `chain`.
    sep = kwargs.pop('sep', None)
    with chain(*args, **kwargs) as f:
        data = [_psm_to_row(item, sep) for item in f]
    return pd.DataFrame(data)
390
391
def filter_df(*args, **kwargs):
    """Filter PSMs from idXML files or DataFrames and return them as a
    :py:class:`DataFrame`. Positional arguments can be idXML files or DataFrames.

    If every positional argument is a :py:class:`pandas.DataFrame`, they are
    concatenated; otherwise all arguments are passed to :py:func:`DataFrame`
    to read the data first.

    Requires :py:mod:`pandas`.

    .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'.

    Parameters
    ----------
    key : str / iterable / callable, keyword only, optional
        Peptide identification score. Default is 'score'. You will probably need to change it.
    is_decoy : str / iterable / callable, keyword only, optional
        Default is 'is decoy'.
    *args
        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
    **kwargs
        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.

    Returns
    -------
    out : pandas.DataFrame
    """
    import pandas as pd

    kwargs.setdefault('key', 'score')
    only_frames = all(isinstance(arg, pd.DataFrame) for arg in args)
    df = pd.concat(args) if only_frames else DataFrame(*args, **kwargs)
    # Defaulted only after reading, so it reaches `aux.filter` but not `DataFrame`.
    kwargs.setdefault('is_decoy', 'is decoy')
    return aux.filter(df, **kwargs)
424
425
# Target-decoy approach (TDA) helpers, generated by the factories in
# `auxiliary` from this module's `is_decoy`, `chain` and default score getter.
fdr = aux._make_fdr(is_decoy, None)
# Default score getter: the 'score' of the first 'PeptideHit' of an entry.
_key = lambda x: x['PeptideHit'][0]['score']
qvalues = aux._make_qvalues(chain, is_decoy, None, _key)
# NOTE: this public name (documented in the module docstring) shadows the
# builtin `filter` inside this module.
filter = aux._make_filter(chain, is_decoy, None, _key, qvalues)
# Provides `filter.chain`, as documented in the module docstring.
filter.chain = aux._make_chain(filter, 'filter', True)
431