"""
idxml - idXML file reader
=========================

Summary
-------

**idXML** is a format specified in the
`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
It defines a list of peptide identifications.

This module provides a minimalistic way to extract information from idXML
files. You can use the old functional interface (:py:func:`read`) or the new
object-oriented interface (:py:class:`IDXML`) to iterate over entries in
``<PeptideIdentification>`` elements. Note that each entry can contain more than one PSM
(peptide-spectrum match). They are accessible with the ``'PeptideHit'`` key.
:py:class:`IDXML` objects also support direct indexing by element ID.

Data access
-----------

  :py:class:`IDXML` - a class representing a single idXML file.
  Other data access functions use this class internally.

  :py:func:`read` - iterate through peptide-spectrum matches in an idXML
  file. Data from a single PSM group are converted to a human-readable dict.
  Basically creates an :py:class:`IDXML` object and reads it.

  :py:func:`chain` - read multiple files at once.

  :py:func:`chain.from_iterable` - read multiple files at once, using an
  iterable of files.

  :py:func:`DataFrame` - read idXML files into a :py:class:`pandas.DataFrame`.

Target-decoy approach
---------------------

  :py:func:`filter` - read a chain of idXML files and filter to a certain
  FDR using TDA.

  :py:func:`filter.chain` - chain a series of filters applied independently to
  several files.

  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
  independently to an iterable of files.

  :py:func:`filter_df` - filter idXML files and return a :py:class:`pandas.DataFrame`.

  :py:func:`is_decoy` - determine if a "PeptideIdentification" entry should be
  considered decoy.

  :py:func:`fdr` - estimate the false discovery rate of a set of identifications
  using the target-decoy approach.

  :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
  set using the target-decoy approach.

Deprecated functions
--------------------

  :py:func:`version_info` - get information about idXML version and schema.
  You can just read the corresponding attribute of the :py:class:`IDXML`
  object.

  :py:func:`get_by_id` - get an element by its ID and extract the data from it.
  You can just call the corresponding method of the :py:class:`IDXML`
  object.

  :py:func:`iterfind` - iterate over elements in an idXML file.
  You can just call the corresponding method of the :py:class:`IDXML`
  object.

Dependencies
------------

This module requires :py:mod:`lxml`.

-------------------------------------------------------------------------------
"""

#   Copyright 2020 Lev Levitsky
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.


import warnings
from .. import auxiliary as aux
from .. import xml, _schema_defaults


class IDXML(xml.IndexedXML):
    """Parser class for idXML files."""
    file_format = 'idXML'
    _root_element = 'IdXML'
    _default_schema = _schema_defaults._idxml_schema_defaults
    _default_version = '1.5'
    _default_iter_tag = 'PeptideIdentification'
    _structures_to_flatten = {}
    _indexed_tags = {'ProteinHit'}
    _schema_location_param = 'noNamespaceSchemaLocation'

    def __init__(self, *args, **kwargs):
        """Create the parser; `retrieve_refs` defaults to :py:const:`True`
        unless the caller passes it explicitly. All arguments are forwarded
        to the parent class constructor."""
        kwargs.setdefault('retrieve_refs', True)
        super(IDXML, self).__init__(*args, **kwargs)

    def _get_info_smart(self, element, **kwargs):
        """Extract the info in a smart way depending on the element type.

        The element is unpacked with :py:meth:`_get_info`; afterwards the
        ``start``/``end`` values are converted to lists of integers and the
        ``aa_before``/``aa_after`` strings are split on whitespace.
        """
        name = xml._local_name(element)
        kwargs = dict(kwargs)
        rec = kwargs.pop("recursive", None)

        # Try not to recursively unpack the root element
        # unless the user really wants to.
        if rec is None:
            rec = name != self._root_element
        info = self._get_info(element, recursive=rec, **kwargs)

        for k in ['start', 'end']:
            v = info.get(k)
            # NOTE(review): only a two-element list is converted, and only its
            # first element is parsed -- presumably the attribute shows up
            # twice in the extracted info; confirm against `_get_info`.
            if isinstance(v, list) and len(v) == 2:
                info[k] = [int(x) for x in v[0].split()]
        for k in ['aa_before', 'aa_after']:
            if k in info:
                info[k] = info[k].split()
        return info

    def _retrieve_refs(self, info, **kwargs):
        """Retrieves and embeds the data for each attribute in `info` whose
        name ends in ``_refs``.

        The attribute value is a whitespace-separated list of element IDs.
        Every ID is resolved with :py:meth:`get_by_id`, the ``id`` key is
        removed from each retrieved dict, and the resulting list is stored in
        `info` under the attribute name without the ``_refs`` suffix,
        replacing the original attribute. If any ID cannot be resolved
        (:py:exc:`KeyError`), a warning is issued and the attribute is left
        untouched.
        """
        # Iterate over a snapshot, because `info` is modified in the loop.
        for k, v in dict(info).items():
            if k.endswith('_refs'):
                try:
                    by_id = [self.get_by_id(x, retrieve_refs=True) for x in v.split()]
                except KeyError:
                    warnings.warn('Ignoring unresolved reference: ' + v)
                else:
                    for x in by_id:
                        x.pop('id', None)
                    info[k[:-5]] = by_id
                    del info[k]


def read(source, **kwargs):
    """Parse `source` and iterate through peptide-spectrum matches.

    .. note:: This function is provided for backward compatibility only.
        It simply creates an :py:class:`IDXML` instance using
        provided arguments and returns it.

    Parameters
    ----------
    source : str or file
        A path to a target idXML file or the file object itself.

    recursive : bool, optional
        If :py:const:`False`, subelements will not be processed when
        extracting info from elements. Default is :py:const:`True`.

    retrieve_refs : bool, optional
        If :py:const:`True`, additional information from references will be
        automatically added to the results. The file processing time will
        increase. Default is :py:const:`True`.

    iterative : bool, optional
        Specifies whether iterative XML parsing should be used. Iterative
        parsing significantly reduces memory usage and may be just a little
        slower. When `retrieve_refs` is :py:const:`True`, however, it is
        highly recommended to disable iterative parsing if possible.
        Default value is :py:const:`True`.

    read_schema : bool, optional
        If :py:const:`True`, attempt to extract information from the XML schema
        mentioned in the idXML header (default). Otherwise, use default
        parameters. Disable this to avoid waiting on slow network connections or
        if you don't like to get the related warnings.

    build_id_cache : bool, optional
        Defines whether a cache of element IDs should be built and stored on the
        created :py:class:`IDXML` instance. Default value is the value of
        `retrieve_refs`.

        .. note:: This parameter is ignored when ``use_index`` is ``True`` (default).

    use_index : bool, optional
        Defines whether an index of byte offsets needs to be created for
        the indexed elements. If :py:const:`True` (default), `build_id_cache` is ignored.

    indexed_tags : container of bytes, optional
        Defines which elements need to be indexed. Empty set by default.

    Returns
    -------
    out : IDXML
       An iterator over the dicts with PSM properties.
    """
    # Work on a copy so the caller's dict is never modified.
    kwargs = kwargs.copy()
    kwargs.setdefault('retrieve_refs', True)
    kwargs.setdefault('build_id_cache', kwargs.get('retrieve_refs'))
    return IDXML(source, **kwargs)


def iterfind(source, path, **kwargs):
    """Parse `source` and yield info on elements with specified local
    name or by specified "XPath".

    .. note:: This function is provided for backward compatibility only.
        If you do multiple :py:func:`iterfind` calls on one file, you should
        create an :py:class:`IDXML` object and use its
        :py:meth:`!iterfind` method.

    Parameters
    ----------
    source : str or file
        File name or file-like object.

    path : str
        Element name or XPath-like expression. Only local names separated
        with slashes are accepted. An asterisk (`*`) means any element.
        You can specify a single condition in the end, such as:
        ``"/path/to/element[some_value>1.5]"``
        Note: you can do much more powerful filtering using plain Python.
        The path can be absolute or "free". Please don't specify
        namespaces.

    recursive : bool, optional
        If :py:const:`False`, subelements will not be processed when
        extracting info from elements. Default is :py:const:`True`.

    retrieve_refs : bool, optional
        If :py:const:`True`, additional information from references will be
        automatically added to the results. The file processing time will
        increase. Default is :py:const:`False`.

    iterative : bool, optional
        Specifies whether iterative XML parsing should be used. Iterative
        parsing significantly reduces memory usage and may be just a little
        slower. When `retrieve_refs` is :py:const:`True`, however, it is
        highly recommended to disable iterative parsing if possible.
        Default value is :py:const:`True`.

    read_schema : bool, optional
        If :py:const:`True`, attempt to extract information from the XML schema
        mentioned in the idXML header (default). Otherwise, use default
        parameters. Disable this to avoid waiting on slow network connections or
        if you don't like to get the related warnings.

    build_id_cache : bool, optional
        Defines whether a cache of element IDs should be built and stored on the
        created :py:class:`IDXML` instance. Default value is the value of
        `retrieve_refs`.

    Returns
    -------
    out : iterator
    """
    # Work on a copy so the caller's dict is never modified.
    kwargs = kwargs.copy()
    kwargs.setdefault('build_id_cache', kwargs.get('retrieve_refs'))
    # The same keyword arguments go both to the constructor and to the method.
    return IDXML(source, **kwargs).iterfind(path, **kwargs)


version_info = xml._make_version_info(IDXML)


def get_by_id(source, elem_id, **kwargs):
    """Parse `source` and return the element with `id` attribute equal
    to `elem_id`. Returns :py:const:`None` if no such element is found.

    .. note:: This function is provided for backward compatibility only.
        If you do multiple :py:func:`get_by_id` calls on one file, you should
        create an :py:class:`IDXML` object and use its
        :py:meth:`!get_by_id` method.

    Parameters
    ----------
    source : str or file
        A path to a target idXML file or the file object itself.

    elem_id : str
        The value of the `id` attribute to match.

    Returns
    -------
    out : :py:class:`dict` or :py:const:`None`
    """
    # The same keyword arguments go both to the constructor and to the method.
    return IDXML(source, **kwargs).get_by_id(elem_id, **kwargs)


chain = aux.ChainBase._make_chain(IDXML)


def is_decoy(psm, prefix=None):
    """Given a PSM dict, return :py:const:`True` if it is marked as decoy,
    and :py:const:`False` otherwise.

    Only the first ``'PeptideHit'`` of the entry is inspected.

    Parameters
    ----------
    psm : dict
        A dict, as yielded by :py:func:`read`.
    prefix : ignored

    Returns
    -------
    out : bool
    """
    return psm['PeptideHit'][0]['target_decoy'] == 'decoy'


def DataFrame(*args, **kwargs):
    """Read idXML files into a :py:class:`pandas.DataFrame`.

    Requires :py:mod:`pandas`.

    .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'.

    Parameters
    ----------
    *args
        Passed to :py:func:`chain`

    **kwargs
        Passed to :py:func:`chain`

    sep : str or None, keyword only, optional
        Some values related to PSMs (such as protein information) are variable-length
        lists. If `sep` is a :py:class:`str`, they will be packed into single string using
        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
        :py:const:`None`.

    Returns
    -------
    out : pandas.DataFrame
    """
    import pandas as pd

    scalar_types = (str, int, float)
    sep = kwargs.pop('sep', None)
    data = []

    with chain(*args, **kwargs) as f:
        for item in f:
            # Top-level scalar values of the entry become columns as they are.
            info = {k: v for k, v in item.items() if isinstance(v, scalar_types)}

            # A missing or empty hit list yields a row without hit-level columns.
            hits = item.get('PeptideHit') or [None]
            peptide_hit = hits[0]
            if peptide_hit is not None:
                info.update((k, v) for k, v in peptide_hit.items() if isinstance(v, scalar_types))
                protein = peptide_hit.get('protein')
                if protein:
                    accessions, isd, starts, ends, aa_bs, aa_as = [], [], [], [], [], []
                    # Protein records and the per-protein position data are walked in
                    # lockstep; zip() stops at the shortest of the sequences.
                    for d, start, end, aab, aaa in zip(
                            protein, peptide_hit['start'], peptide_hit['end'],
                            peptide_hit['aa_before'], peptide_hit['aa_after']):
                        accessions.append(d.get('accession'))
                        isd.append(d.get('target_decoy'))
                        starts.append(start)
                        ends.append(end)
                        aa_bs.append(aab)
                        aa_as.append(aaa)

                    # The hit counts as decoy only if every matched protein is a decoy.
                    isd = all(x == 'decoy' for x in isd)
                    if sep is not None:
                        # Lists are joined only when every element is a string.
                        if all(isinstance(acc, str) for acc in accessions):
                            accessions = sep.join(accessions)
                        if all(isinstance(aaa, str) for aaa in aa_as):
                            aa_as = sep.join(aa_as)
                        if all(isinstance(aab, str) for aab in aa_bs):
                            aa_bs = sep.join(aa_bs)
                    if all(acc is None for acc in accessions):
                        accessions = None

                    # Remaining columns are taken from the first protein record; the
                    # aggregated values assigned below take precedence over them.
                    info.update((k, v) for k, v in protein[0].items()
                                if isinstance(v, (str, int, float, list)))
                    info['accession'] = accessions
                    info['is decoy'] = isd
                    info['start'] = starts
                    info['end'] = ends
                    info['aa_before'] = aa_bs
                    info['aa_after'] = aa_as
            data.append(info)
    return pd.DataFrame(data)


def filter_df(*args, **kwargs):
    """Read idXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs.
    Positional arguments can be idXML files or DataFrames.

    Requires :py:mod:`pandas`.

    .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'.

    Parameters
    ----------
    key : str / iterable / callable, keyword only, optional
        Peptide identification score. Default is 'score'. You will probably need to change it.
    is_decoy : str / iterable / callable, keyword only, optional
        Default is 'is decoy'.
    *args
        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
    **kwargs
        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.

    Returns
    -------
    out : pandas.DataFrame
    """
    import pandas as pd

    kwargs.setdefault('key', 'score')
    if all(isinstance(arg, pd.DataFrame) for arg in args):
        df = pd.concat(args)
    else:
        # NOTE(review): the complete set of keyword arguments (including `key`
        # and any filtering options) is forwarded to DataFrame() and is passed
        # on to auxiliary.filter() below as well -- confirm both accept them.
        df = DataFrame(*args, **kwargs)
    # Set only after reading, so DataFrame() does not receive this default.
    kwargs.setdefault('is_decoy', 'is decoy')
    return aux.filter(df, **kwargs)


def _key(x):
    """Return the score of the first 'PeptideHit' in the entry `x`."""
    return x['PeptideHit'][0]['score']


fdr = aux._make_fdr(is_decoy, None)
qvalues = aux._make_qvalues(chain, is_decoy, None, _key)
filter = aux._make_filter(chain, is_decoy, None, _key, qvalues)
filter.chain = aux._make_chain(filter, 'filter', True)