1""" KEGG api interface. """
2from __future__ import absolute_import
3
4import os
5import warnings
6from datetime import datetime
7from operator import itemgetter
8from contextlib import closing
9
10import six
11
12from orangecontrib.bioinformatics.kegg import caching
13from orangecontrib.bioinformatics.kegg.types import Link, BInfo, Definition, OrganismSummary
14from orangecontrib.bioinformatics.kegg.caching import touch_dir, cache_entry, cached_method
15from orangecontrib.bioinformatics.kegg.service import web_service
16
# A list of all databases with names, abbreviations.
# Each entry is a 4-tuple: (title, database name, abbreviation, fourth field).
# `None` is used where a field has no value for that database.
# NOTE(review): the fourth field looks like the prefix of the entry
# identifiers used by the database (e.g. "C" for compounds) -- confirm.
DATABASES = [
    ("KEGG Pathway", "pathway", "path", None),
    ("KEGG Brite", "brite", "br", None),
    ("KEGG Module", "module", "md", "M"),
    ("KEGG Disease", "disease", "ds", "H"),
    ("KEGG Drug", "drug", "dr", "D"),
    ("KEGG Orthology", "orthology", "ko", "K"),
    ("KEGG Genome", "genome", "genome", "T"),
    ("KEGG Genomes", "genomes", "gn", "T"),
    ("KEGG Genes", "genes", None, None),
    ("KEGG Ligand", "ligand", "ligand", None),
    ("KEGG Compound", "compound", "cpd", "C"),
    ("KEGG Glycan", "glycan", "gl", "G"),
    ("KEGG Reaction", "reaction", "rn", "R"),
    ("KEGG RPair", "rpair", "rp", "RP"),
    ("KEGG RClass", "rclass", "rc", "RC"),
    ("KEGG Enzyme", "enzyme", "ec", "E"),
]
36
37
38def _link_targets(links):
39    return sorted(set(map(itemgetter(1), links)))
40
41
class KeggApi(object):
    """
    An abstraction of a rest KEGG API.

    All queries are issued through the object returned by ``web_service()``
    (stored as ``self.service``). Several methods are placeholders that
    only raise :class:`NotImplementedError`.
    """

    def __init__(self):
        self.service = web_service()

    def list_organisms(self):
        """
        Return a list of all available organisms,

        >>> api.list_organisms()  # doctest: +ELLIPSIS
        [OrganismSummary(entry_id='T01001', ...
        """
        lines = self.service.list.organism.get().splitlines()
        return [OrganismSummary.from_str(line) for line in lines]

    def list_pathways(self, organism):
        """
        Return a list of all available pathways for `organism`

        >>> api.list_pathways("hsa")  # doctest: +ELLIPSIS
        [Definition(entry_id='path:hsa00010', ...
        """
        lines = self.service.list.pathway(organism).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    def list(self, db):
        """
        Return a list of all available entries in database `db`.
        """
        lines = self.service.list(db).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    #######
    # DBGET
    #######

    def info(self, db):
        """
        Return info for database `db`

        >>> print(api.info("pathway"))
        BInfo(entry_id='path', definition='KEGG Pathway Database', ...
        """
        result = self.service.info(db).get()
        return BInfo.from_text(result)

    def find(self, db, keywords):
        """
        Search database 'db' for keywords.

        `keywords` is either a single string or a sequence of strings;
        a sequence is joined with '+' before the query is issued.
        The raw response of the service is returned.
        """
        if isinstance(keywords, six.string_types):
            keywords = [keywords]

        return self.service.find(db)("+".join(keywords)).get()

    def get(self, ids):
        """
        Retrieve database entries for `ids` list.

        `ids` is either a single id string or a sequence of id strings
        (joined with '+'). The raw response of the service is returned.
        """
        if not isinstance(ids, six.string_types):
            # Sequence of ids
            ids = "+".join(ids)

        return self.service.get(ids).get()

    def conv(self, target_db, source):
        """
        Return a mapping from source to target_db ids as a list of two
        tuples [(source_id, target_id), ...].

        `source` is either a single string or a sequence of strings
        (joined with '+').
        """
        if not isinstance(source, six.string_types):
            source = "+".join(source)

        res = self.service.conv(target_db)(source).get()
        return [tuple(line.split("\t")) for line in res.splitlines()]

    def link(self, target_db, source_db=None, ids=None):
        """
        Return the links into `target_db` from either a whole `source_db`
        or from the given `ids`.

        Exactly one of `source_db` and `ids` must be supplied. `ids` is
        either a single id string or a sequence of id strings (joined
        with '+'), the same as accepted by :meth:`get`.

        The result is a list of `Link` instances, one for every line of
        the response, built from the line's whitespace separated fields.

        :raises ValueError: if neither or both of `source_db` and `ids`
            are supplied.
        """
        if not (source_db or ids):
            raise ValueError("One of 'source_db' or 'ids' must be supplied")
        if source_db and ids:
            raise ValueError("Only one 'source_db' or 'ids' must be supplied")

        if source_db:
            query = source_db
        elif isinstance(ids, six.string_types):
            # A single id; joining it would split it into characters.
            query = ids
        else:
            query = "+".join(ids)

        result = self.service.link(target_db)(query).get()
        return [Link._make(line.split()) for line in result.splitlines()]

    # The helpers below all return the sorted, unique link targets
    # (see `_link_targets`) of a single-id `link` query.

    def get_genes_by_enzyme(self, enzyme_id, org):
        """Return the link targets in `org` for `enzyme_id`."""
        return _link_targets(self.link(org, ids=[enzyme_id]))

    def get_enzymes_by_gene(self, gene_id):
        """Return the link targets in "ec" for `gene_id`."""
        return _link_targets(self.link("ec", ids=[gene_id]))

    def get_enzymes_by_compound(self, compound_id):
        """Return the link targets in "ec" for `compound_id`."""
        return _link_targets(self.link("ec", ids=[compound_id]))

    def get_enzymes_by_glycan(self, glycan_id):
        """Return the link targets in "ec" for `glycan_id`."""
        return _link_targets(self.link("ec", ids=[glycan_id]))

    def get_enzymes_by_reaction(self, reaction_id):
        """Return the link targets in "ec" for `reaction_id`."""
        return _link_targets(self.link("ec", ids=[reaction_id]))

    def get_compounds_by_enzyme(self, enzyme_id):
        """Return the link targets in "compound" for `enzyme_id`."""
        return _link_targets(self.link("compound", ids=[enzyme_id]))

    def get_compounds_by_reaction(self, reaction_id):
        """Return the link targets in "compound" for `reaction_id`."""
        return _link_targets(self.link("compound", ids=[reaction_id]))

    def get_glycans_by_enzyme(self, enzyme_id):
        """Return the link targets in "gl" for `enzyme_id`."""
        return _link_targets(self.link("gl", ids=[enzyme_id]))

    def get_glycans_by_reaction(self, reaction_id):
        """Return the link targets in "gl" for `reaction_id`."""
        return _link_targets(self.link("gl", ids=[reaction_id]))

    def get_reactions_by_enzyme(self, enzyme_id):
        """Return the link targets in "rn" for `enzyme_id`."""
        return _link_targets(self.link("rn", ids=[enzyme_id]))

    def get_reactions_by_compound(self, compound_id):
        """Return the link targets in "rn" for `compound_id`."""
        return _link_targets(self.link("rn", ids=[compound_id]))

    def get_reactions_by_glycan(self, glycan_id):
        """Return the link targets in "rn" for `glycan_id`."""
        return _link_targets(self.link("rn", ids=[glycan_id]))

    ######
    # SSDB
    ######

    # No replacement api in the KEGG REST api.
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_paralogs_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    #######
    # Motif
    #######

    # No replacement api in KEGG REST api
    def get_motifs_by_gene(self, genes_id, db):
        raise NotImplementedError

    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        raise NotImplementedError

    ####
    # KO
    ####

    def get_ko_by_gene(self, genes_id):
        raise NotImplementedError

    def get_ko_by_ko_class(self, ko_class_id):
        raise NotImplementedError

    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        raise NotImplementedError

    def get_genes_by_ko(self, ko_id, org):
        raise NotImplementedError

    #########
    # Pathway
    #########

    def mark_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def color_pathway_by_objects(self, pathway_id, object_id_list, fg_color_list, bg_color_list):
        raise NotImplementedError

    def color_pathway_by_elements(self, pathway_id, element_id_list, fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_marked_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_objects(self, pathway_id, object_id_list, fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_elements(self, pathway_id, element_id_list, fg_color_list, bg_color_list):
        raise NotImplementedError

    # NOTE(review): unlike every other query in this class, the two methods
    # below return whatever the service attribute call yields and never
    # issue `.get()` -- confirm this is intended.
    def get_references_by_pathway(self, pathway_id):
        return self.service.get_references_by_pathway(pathway_id)

    def get_element_relations_by_pathway(self, pathway_id):
        return self.service.get_element_relations_by_pathway(pathway_id)

    def get_genes_by_organism(self, organism, offset=None, limit=None):
        """
        Return the first whitespace separated field of every line listed
        by the service for `organism`.

        :raises NotImplementedError: if `offset` or `limit` is not None;
            neither is supported any more.
        """
        if offset is not None:
            raise NotImplementedError("offset is no longer supported")
        if limit is not None:
            raise NotImplementedError("limit is no longer supported.")

        lines = self.service.list(organism).get().splitlines()
        return [line.split(None, 1)[0] for line in lines]

    def get_number_of_genes_by_organism(self, organism):
        raise NotImplementedError

    ####################
    # Objects by pathway
    ####################

    def get_elements_by_pathway(self, pathway_id):
        raise NotImplementedError

    def get_genes_by_pathway(self, pathway_id):
        """Return the link targets in "genes" for `pathway_id`."""
        return _link_targets(self.link("genes", ids=[pathway_id]))

    def get_enzymes_by_pathway(self, pathway_id):
        """Return the link targets in "ec" for `pathway_id`."""
        return _link_targets(self.link("ec", ids=[pathway_id]))

    def get_compounds_by_pathway(self, pathway_id):
        """Return the link targets in "compound" for `pathway_id`."""
        return _link_targets(self.link("compound", ids=[pathway_id]))

    def get_drugs_by_pathway(self, pathway_id):
        """Return the link targets in "drug" for `pathway_id`."""
        return _link_targets(self.link("drug", ids=[pathway_id]))

    def get_glycans_by_pathway(self, pathway_id):
        """Return the link targets in "gl" for `pathway_id`."""
        return _link_targets(self.link("gl", ids=[pathway_id]))

    def get_reactions_by_pathway(self, pathway_id):
        """Return the link targets in "rn" for `pathway_id`."""
        return _link_targets(self.link("rn", ids=[pathway_id]))

    def get_kos_by_pathway(self, pathway_id):
        """Return the link targets in "ko" for `pathway_id`."""
        return _link_targets(self.link("ko", ids=[pathway_id]))

    #############################################
    # Pathways and genes of a specific organism #
    #############################################

    def get_genes_pathway_organism(self, organism):
        """
        Return all links from `organism` into "pathway" as plain tuples.
        """
        links = self.link("pathway", organism)
        return [tuple(link) for link in links]

    #####################
    # Pathways by objects
    #####################

    # These functions returned results intersections.
    def get_pathways_by_genes(self, gene_list):
        raise NotImplementedError

    def get_pathways_by_enzymes(self, enzyme_list):
        raise NotImplementedError

    def get_pathways_by_compounds(self, compound_list):
        raise NotImplementedError

    def get_pathways_by_drugs(self, drug_list):
        raise NotImplementedError

    def get_pathways_by_glycans(self, glycan_list):
        raise NotImplementedError

    def get_pathways_by_reactions(self, reaction_list):
        raise NotImplementedError

    def get_pathways_by_kos(self, ko_list):
        raise NotImplementedError

    ##########################
    # Relations among pathways
    ##########################

    def get_linked_pathways(self, pathway_id):
        """
        Return the link targets in "pathway" for `pathway_id`; the
        "path:" prefix is added to `pathway_id` when it is missing.
        """
        if not pathway_id.startswith("path:"):
            pathway_id = "path:" + pathway_id
        return _link_targets(self.link("pathway", ids=[pathway_id]))
322
323
324"""
325KEGG api with caching
326"""
327
328
329try:
330    from functools import lru_cache
331except ImportError:
332    # TODO: move a copy of lru_cache in .caching if distributing this as a
333    # standalone package
334    from Orange.utils import lru_cache
335
336
class CachedKeggApi(KeggApi):
    """
    A :class:`KeggApi` whose query results are cached.

    Most methods are wrapped with ``cached_method``; :meth:`cache_store`
    and :meth:`last_modified` are the hooks that decorator needs.
    :meth:`info` is only memoized with ``lru_cache`` and is not
    persistently cached.
    """

    def __init__(self, store=None):
        KeggApi.__init__(self)
        # Always define the attribute: it used to be set only when no
        # `store` was passed, leaving `self.store` undefined otherwise.
        self.store = {} if store is None else store

    # Needed API for cached decorator.
    def cache_store(self):
        """
        Return a new ``Sqlite3Store`` for the "kegg_api_cache_2.sqlite3"
        file inside the configured "cache.path" directory.
        """
        from . import conf

        path = conf.params["cache.path"]
        touch_dir(path)
        return caching.Sqlite3Store(os.path.join(path, "kegg_api_cache_2.sqlite3"))

    def last_modified(self, args, kwargs=None):
        """
        Return the release set with :meth:`set_default_release`, or an
        empty string when none was set. `args` and `kwargs` are ignored.
        """
        return getattr(self, "default_release", "")

    def set_default_release(self, release):
        """Set the release string reported by :meth:`last_modified`."""
        self.default_release = release

    @cached_method
    def list_organisms(self):
        return KeggApi.list_organisms(self)

    @cached_method
    def list_pathways(self, organism):
        return KeggApi.list_pathways(self, organism)

    @cached_method
    def list(self, db):
        return KeggApi.list(self, db)

    @lru_cache()  # not persistently cached
    def info(self, db):
        return KeggApi.info(self, db)

    @cached_method
    def find(self, db, keywords):
        return KeggApi.find(self, db, keywords)

    @cached_method
    def get(self, ids):
        """
        Retrieve database entries for `ids`; a sequence of ids is handled
        by :meth:`_batch_get`, a single id string by :meth:`KeggApi.get`.
        """
        if isinstance(ids, six.string_types):
            return KeggApi.get(self, ids)
        return self._batch_get(ids)

    @cached_method
    def link(self, target_db, source_db=None, ids=None):
        return KeggApi.link(self, target_db, source_db, ids)

    def _batch_get(self, ids):
        """
        Retrieve the entries for a sequence of at most 10 `ids`.

        Only the ids without a valid individual cache entry are requested
        from the service; the response is split into entries which are
        then stored in the cache one by one. The result is the
        concatenation of the cached entry texts in the order of `ids`.

        Ids for which no entry could be matched in the response are
        reported with a warning and left out of the result.

        :raises ValueError: if more than 10 ids are passed.
        """
        if len(ids) > 10:
            raise ValueError("Can batch at most 10 ids at a time.")

        get = self.get
        uncached = []
        unmatched = set()

        with closing(get.cache_store()) as store:
            # Which ids are already cached
            # TODO: Invalidate entries by release string.
            for entry_id in ids:
                key = get.key_from_args((entry_id,))
                if not get.key_has_valid_cache(key, store):
                    uncached.append(entry_id)

        if uncached:
            # in case there are duplicate ids
            uncached = sorted(set(uncached))

            rval = KeggApi.get(self, uncached)
            entries = rval.split("///\n") if rval is not None else []

            if entries and not entries[-1].strip():
                # Delete the last single newline entry if present
                del entries[-1]

            if len(entries) != len(uncached):
                # The response cannot be paired with the ids by position.
                new_uncached, entries = match_by_ids(uncached, entries)
                unmatched = set(uncached) - set(new_uncached)
                uncached = new_uncached
                warnings.warn("Unable to match entries for keys: %s." % ", ".join(map(repr, unmatched)))

            with closing(get.cache_store()) as store:
                for entry_id, entry in zip(uncached, entries):
                    key = get.key_from_args((entry_id,))
                    if entry is not None:
                        # Restore the terminator removed by the split.
                        entry = entry + "///\n"
                    store[key] = cache_entry(entry, mtime=datetime.now())

        # Finally join all the results, but drop all None objects.
        # The store must be bound here; the ones opened above are closed.
        # Unmatched ids were never stored, so they are skipped.
        with closing(get.cache_store()) as store:
            entries = [
                store[get.key_from_args((entry_id,))].value
                for entry_id in ids
                if entry_id not in unmatched
            ]

        return "".join(entry for entry in entries if entry is not None)

    @cached_method
    def conv(self, target_db, source):
        return KeggApi.conv(self, target_db, source)

    ########
    # LinkDB
    ########

    @cached_method
    def get_genes_by_enzyme(self, enzyme_id, org):
        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)

    @cached_method
    def get_enzymes_by_gene(self, genes_id):
        return KeggApi.get_enzymes_by_gene(self, genes_id)

    @cached_method
    def get_enzymes_by_compound(self, compound_id):
        return KeggApi.get_enzymes_by_compound(self, compound_id)

    @cached_method
    def get_enzymes_by_glycan(self, glycan_id):
        return KeggApi.get_enzymes_by_glycan(self, glycan_id)

    @cached_method
    def get_enzymes_by_reaction(self, reaction_id):
        return KeggApi.get_enzymes_by_reaction(self, reaction_id)

    @cached_method
    def get_compounds_by_enzyme(self, enzyme_id):
        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)

    @cached_method
    def get_compounds_by_reaction(self, reaction_id):
        return KeggApi.get_compounds_by_reaction(self, reaction_id)

    @cached_method
    def get_glycans_by_enzyme(self, enzyme_id):
        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)

    @cached_method
    def get_glycans_by_reaction(self, reaction_id):
        return KeggApi.get_glycans_by_reaction(self, reaction_id)

    @cached_method
    def get_reactions_by_enzyme(self, enzyme_id):
        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)

    @cached_method
    def get_reactions_by_compound(self, compound_id):
        return KeggApi.get_reactions_by_compound(self, compound_id)

    @cached_method
    def get_reactions_by_glycan(self, glycan_id):
        return KeggApi.get_reactions_by_glycan(self, glycan_id)

    ######
    # SSDB
    ######

    @cached_method
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset, limit)

    @cached_method
    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset, limit)

    @cached_method
    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit)

    @cached_method
    def get_paralogs_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)

    #######
    # Motif
    #######

    @cached_method
    def get_motifs_by_gene(self, genes_id, db):
        return KeggApi.get_motifs_by_gene(self, genes_id, db)

    @cached_method
    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)

    ####
    # KO
    ####

    @cached_method
    def get_ko_by_gene(self, genes_id):
        return KeggApi.get_ko_by_gene(self, genes_id)

    @cached_method
    def get_ko_by_ko_class(self, ko_class_id):
        # Delegate to the base class method like every other wrapper;
        # `service` is an instance attribute, so `KeggApi.service` failed.
        return KeggApi.get_ko_by_ko_class(self, ko_class_id)

    @cached_method
    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset, limit)

    @cached_method
    def get_genes_by_ko(self, ko_id, org):
        return KeggApi.get_genes_by_ko(self, ko_id, org)

    #########
    # Pathway
    #########

    @cached_method
    def get_genes_by_organism(self, organism, offset=None, limit=None):
        return KeggApi.get_genes_by_organism(self, organism, offset=offset, limit=limit)

    @cached_method
    def get_number_of_genes_by_organism(self, organism):
        return KeggApi.get_number_of_genes_by_organism(self, organism)

    @cached_method
    def get_pathways_by_genes(self, gene_list):
        return KeggApi.get_pathways_by_genes(self, gene_list)

    @cached_method
    def get_pathways_by_enzymes(self, enzyme_list):
        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)

    @cached_method
    def get_pathways_by_compounds(self, compound_list):
        return KeggApi.get_pathways_by_compounds(self, compound_list)

    @cached_method
    def get_pathways_by_drugs(self, drug_list):
        return KeggApi.get_pathways_by_drugs(self, drug_list)

    @cached_method
    def get_pathways_by_glycans(self, glycan_list):
        return KeggApi.get_pathways_by_glycans(self, glycan_list)

    @cached_method
    def get_pathways_by_reactions(self, reaction_list):
        return KeggApi.get_pathways_by_reactions(self, reaction_list)

    @cached_method
    def get_pathways_by_kos(self, ko_list):
        return KeggApi.get_pathways_by_kos(self, ko_list)

    @cached_method
    def get_elements_by_pathway(self, pathway_id):
        return KeggApi.get_elements_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_by_pathway(self, pathway_id):
        return KeggApi.get_genes_by_pathway(self, pathway_id)

    @cached_method
    def get_enzymes_by_pathway(self, pathway_id):
        return KeggApi.get_enzymes_by_pathway(self, pathway_id)

    @cached_method
    def get_compounds_by_pathway(self, pathway_id):
        return KeggApi.get_compounds_by_pathway(self, pathway_id)

    @cached_method
    def get_drugs_by_pathway(self, pathway_id):
        return KeggApi.get_drugs_by_pathway(self, pathway_id)

    @cached_method
    def get_glycans_by_pathway(self, pathway_id):
        return KeggApi.get_glycans_by_pathway(self, pathway_id)

    @cached_method
    def get_reactions_by_pathway(self, pathway_id):
        return KeggApi.get_reactions_by_pathway(self, pathway_id)

    @cached_method
    def get_kos_by_pathway(self, pathway_id):
        return KeggApi.get_kos_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_pathway_organism(self, org):
        return KeggApi.get_genes_pathway_organism(self, org)
626
627
def match_by_ids(ids, entries):
    """
    Pair search ids with the entry texts they refer to.

    The identifier of an entry is the second whitespace separated field
    of its first line. Matching is done in two passes: first the
    complete search id is compared with the entry identifiers, then, for
    the ids still unmatched, only the part after the first ':' is
    compared. Every entry is matched at most once and duplicate search
    ids are considered only once.

    Entries whose first line has no identifier field are ignored.

    :param ids: search ids (strings).
    :param entries: entry texts (strings).
    :return: a ``(matched_ids, matched_entries)`` tuple of two parallel
        lists. Full id matches come first, followed by the identifier
        only matches, each group in the order of `ids`. Ids without a
        matching entry are left out.
    """

    def entry_identifier(entry_text):
        # Second field of the first line, or None when it is missing.
        fields = entry_text.split("\n", 1)[0].split(None, 2)
        return fields[1] if len(fields) > 1 else None

    entries_by_id = {}
    for entry in entries:
        identifier = entry_identifier(entry)
        if identifier is not None:
            entries_by_id[identifier] = entry

    # De-duplicate the search ids while keeping their order, so that the
    # result does not depend on set iteration order.
    seen = set()
    pending = []
    for search_id in ids:
        if search_id not in seen:
            seen.add(search_id)
            pending.append(search_id)

    matched_ids = []
    matched_entries = []

    # First match full search ids
    remaining = []
    for search_id in pending:
        if search_id in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(search_id))
        else:
            remaining.append(search_id)

    # Second pass, split the search ids by ':' to db and identifier part,
    # match by identifier
    for search_id in remaining:
        _, separator, identifier = search_id.partition(":")
        if separator and identifier in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(identifier))

    return matched_ids, matched_entries
676