1""" KEGG api interface. """ 2from __future__ import absolute_import 3 4import os 5import warnings 6from datetime import datetime 7from operator import itemgetter 8from contextlib import closing 9 10import six 11 12from orangecontrib.bioinformatics.kegg import caching 13from orangecontrib.bioinformatics.kegg.types import Link, BInfo, Definition, OrganismSummary 14from orangecontrib.bioinformatics.kegg.caching import touch_dir, cache_entry, cached_method 15from orangecontrib.bioinformatics.kegg.service import web_service 16 17# A list of all databases with names, abbreviations 18DATABASES = [ 19 ("KEGG Pathway", "pathway", "path", None), 20 ("KEGG Brite", "brite", "br", None), 21 ("KEGG Module", "module", "md", "M"), 22 ("KEGG Disease", "disease", "ds", "H"), 23 ("KEGG Drug", "drug", "dr", "D"), 24 ("KEGG Orthology", "orthology", "ko", "K"), 25 ("KEGG Genome", "genome", "genome", "T"), 26 ("KEGG Genomes", "genomes", "gn", "T"), 27 ("KEGG Genes", "genes", None, None), 28 ("KEGG Ligand", "ligand", "ligand", None), 29 ("KEGG Compound", "compound", "cpd", "C"), 30 ("KEGG Glycan", "glycan", "gl", "G"), 31 ("KEGG Reaction", "reaction", "rn", "R"), 32 ("KEGG RPair", "rpair", "rp", "RP"), 33 ("KEGG RClass", "rclass", "rc", "RC"), 34 ("KEGG Enzyme", "enzyme", "ec", "E"), 35] 36 37 38def _link_targets(links): 39 return sorted(set(map(itemgetter(1), links))) 40 41 42class KeggApi(object): 43 """ 44 An abstraction of a rest KEGG API. 45 """ 46 47 def __init__(self): 48 self.service = web_service() 49 50 def list_organisms(self): 51 """ 52 Return a list of all available organisms, 53 54 >>> api.list_organisms() # doctest: +ELLIPSIS 55 [OrganismSummary(entry_id='T01001', ... 56 """ 57 return list(map(OrganismSummary.from_str, self.service.list.organism.get().splitlines())) 58 59 def list_pathways(self, organism): 60 """ 61 Return a list of all available pathways for `organism` 62 63 >>> api.list_pathways("hsa") # doctest: +ELLIPSIS 64 [Definition(entry_id='path:hsa00010', ... 65 """ 66 return list(map(Definition.from_str, self.service.list.pathway(organism).get().splitlines())) 67 68 def list(self, db): 69 """ 70 Return a list of all available entries in database `db`. 71 """ 72 return list(map(Definition.from_str, self.service.list(db).get().splitlines())) 73 74 ####### 75 # DBGET 76 ####### 77 78 def info(self, db): 79 """ 80 Return info for database `db` 81 82 >>> print(api.info("pathway")) 83 BInfo(entry_id='path', definition='KEGG Pathway Database', ... 84 """ 85 result = self.service.info(db).get() 86 return BInfo.from_text(result) 87 88 def find(self, db, keywords): 89 """ 90 Search database 'db' for keywords. 91 """ 92 if isinstance(keywords, six.string_types): 93 keywords = [keywords] 94 95 return self.service.find(db)("+".join(keywords)).get() 96 97 def get(self, ids): 98 """ 99 Retrieve database entries for `ids` list. 100 """ 101 if not isinstance(ids, six.string_types): 102 # Sequence of ids 103 ids = "+".join(ids) 104 105 return self.service.get(ids).get() 106 107 def conv(self, target_db, source): 108 """ 109 Return a mapping from source to target_db ids as a list of two 110 tuples [(source_id, target_id), ...]. 111 112 """ 113 if not isinstance(source, six.string_types): 114 source = "+".join(source) 115 116 res = self.service.conv(target_db)(source).get() 117 return [tuple(line.split("\t")) for line in res.splitlines()] 118 119 def link(self, target_db, source_db=None, ids=None): 120 if not (source_db or ids): 121 raise ValueError("One of 'source_db' or 'ids' must be supplied") 122 if source_db and ids: 123 raise ValueError("Only one 'source_db' or 'ids' must be supplied") 124 125 if source_db: 126 result = self.service.link(target_db)(source_db).get() 127 else: 128 result = self.service.link(target_db)("+".join(ids)).get() 129 130 return list(map(Link._make, map(str.split, result.splitlines()))) 131 132 def get_genes_by_enzyme(self, enzyme_id, org): 133 return _link_targets(self.link(org, ids=[enzyme_id])) 134 135 def get_enzymes_by_gene(self, gene_id): 136 return _link_targets(self.link("ec", ids=[gene_id])) 137 138 def get_enzymes_by_compound(self, compound_id): 139 return _link_targets(self.link("ec", ids=[compound_id])) 140 141 def get_enzymes_by_glycan(self, glycan_id): 142 return _link_targets(self.link("ec", ids=[glycan_id])) 143 144 def get_enzymes_by_reaction(self, reaction_id): 145 return _link_targets(self.link("ec", ids=[reaction_id])) 146 147 def get_compounds_by_enzyme(self, enzyme_id): 148 return _link_targets(self.link("compound", ids=[enzyme_id])) 149 150 def get_compounds_by_reaction(self, reaction_id): 151 return _link_targets(self.link("compound", ids=[reaction_id])) 152 153 def get_glycans_by_enzyme(self, enzyme_id): 154 return _link_targets(self.link("gl", ids=[enzyme_id])) 155 156 def get_glycans_by_reaction(self, reaction_id): 157 return _link_targets(self.link("gl", ids=[reaction_id])) 158 159 def get_reactions_by_enzyme(self, enzyme_id): 160 return _link_targets(self.link("rn", ids=[enzyme_id])) 161 162 def get_reactions_by_compound(self, compound_id): 163 return _link_targets(self.link("rn", ids=[compound_id])) 164 165 def get_reactions_by_glycan(self, glycan_id): 166 return _link_targets(self.link("rn", ids=[glycan_id])) 167 168 ###### 169 # SSDB 170 ###### 171 172 # No replacement api in the KEGG REST api. 173 def get_best_best_neighbors_by_gene(self, genes_id, offset, limit): 174 raise NotImplementedError 175 176 def get_best_neighbors_by_gene(self, genes_id, offset, limit): 177 raise NotImplementedError 178 179 def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit): 180 raise NotImplementedError 181 182 def get_paralogs_by_gene(self, genes_id, offset, limit): 183 raise NotImplementedError 184 185 ####### 186 # Motif 187 ####### 188 189 # No replacement api in KEGG REST api 190 def get_motifs_by_gene(self, genes_id, db): 191 raise NotImplementedError 192 193 def get_genes_by_motifs(self, motif_id_list, offset, limit): 194 raise NotImplementedError 195 196 #### 197 # KO 198 #### 199 200 def get_ko_by_gene(self, genes_id): 201 raise NotImplementedError 202 203 def get_ko_by_ko_class(self, ko_class_id): 204 raise NotImplementedError 205 206 def get_genes_by_ko_class(self, ko_class_id, org, offset, limit): 207 raise NotImplementedError 208 209 def get_genes_by_ko(self, ko_id, org): 210 raise NotImplementedError 211 212 ######### 213 # Pathway 214 ######### 215 216 def mark_pathway_by_objects(self, pathway_id, object_id_list): 217 raise NotImplementedError 218 219 def color_pathway_by_objects(self, pathway_id, object_id_list, fg_color_list, bg_color_list): 220 raise NotImplementedError 221 222 def color_pathway_by_elements(self, pathway_id, element_id_list, fg_color_list, bg_color_list): 223 raise NotImplementedError 224 225 def get_html_of_marked_pathway_by_objects(self, pathway_id, object_id_list): 226 raise NotImplementedError 227 228 def get_html_of_colored_pathway_by_objects(self, pathway_id, object_id_list, fg_color_list, bg_color_list): 229 raise NotImplementedError 230 231 def get_html_of_colored_pathway_by_elements(self, pathway_id, element_id_list, fg_color_list, bg_color_list): 232 raise NotImplementedError 233 234 def get_references_by_pathway(self, pathway_id): 235 return self.service.get_references_by_pathway(pathway_id) 236 237 def get_element_relations_by_pathway(self, pathway_id): 238 return self.service.get_element_relations_by_pathway(pathway_id) 239 240 def get_genes_by_organism(self, organism, offset=None, limit=None): 241 if offset is not None: 242 raise NotImplementedError("offset is no longer supported") 243 if limit is not None: 244 raise NotImplementedError("limit is no longer supported.") 245 246 res = self.service.list(organism).get().splitlines() 247 return [r.split(None, 1)[0] for r in res] 248 249 def get_number_of_genes_by_organism(self, organism): 250 raise NotImplementedError 251 252 #################### 253 # Objects by pathway 254 #################### 255 256 def get_elements_by_pathway(self, pathway_id): 257 raise NotImplementedError 258 259 def get_genes_by_pathway(self, pathway_id): 260 return _link_targets(self.link("genes", ids=[pathway_id])) 261 262 def get_enzymes_by_pathway(self, pathway_id): 263 return _link_targets(self.link("ec", ids=[pathway_id])) 264 265 def get_compounds_by_pathway(self, pathway_id): 266 return _link_targets(self.link("compound", ids=[pathway_id])) 267 268 def get_drugs_by_pathway(self, pathway_id): 269 return _link_targets(self.link("drug", ids=[pathway_id])) 270 271 def get_glycans_by_pathway(self, pathway_id): 272 return _link_targets(self.link("gl", ids=[pathway_id])) 273 274 def get_reactions_by_pathway(self, pathway_id): 275 return _link_targets(self.link("rn", ids=[pathway_id])) 276 277 def get_kos_by_pathway(self, pathway_id): 278 return _link_targets(self.link("ko", ids=[pathway_id])) 279 280 ############################################# 281 # Pathways and genes of a specific organism # 282 ############################################# 283 284 def get_genes_pathway_organism(self, organism): 285 l = self.link("pathway", organism) 286 return list(map(tuple, l)) 287 288 ##################### 289 # Pathways by objects 290 ##################### 291 292 # These functions returned results intersections. 293 def get_pathways_by_genes(self, gene_list): 294 raise NotImplementedError 295 296 def get_pathways_by_enzymes(self, enzyme_list): 297 raise NotImplementedError 298 299 def get_pathways_by_compounds(self, compound_list): 300 raise NotImplementedError 301 302 def get_pathways_by_drugs(self, drug_list): 303 raise NotImplementedError 304 305 def get_pathways_by_glycans(self, glycan_list): 306 raise NotImplementedError 307 308 def get_pathways_by_reactions(self, reaction_list): 309 raise NotImplementedError 310 311 def get_pathways_by_kos(self, ko_list): 312 raise NotImplementedError 313 314 ########################## 315 # Relations among pathways 316 ########################## 317 318 def get_linked_pathways(self, pathway_id): 319 if not pathway_id.startswith("path:"): 320 pathway_id = "path:" + pathway_id 321 return _link_targets(self.link("pathway", ids=[pathway_id])) 322 323 324""" 325KEGG api with caching 326""" 327 328 329try: 330 from functools import lru_cache 331except ImportError: 332 # TODO: move a copy of lru_cache in .caching if distributing this as a 333 # standalone package 334 from Orange.utils import lru_cache 335 336 337class CachedKeggApi(KeggApi): 338 def __init__(self, store=None): 339 KeggApi.__init__(self) 340 if store is None: 341 self.store = {} 342 343 # Needed API for cached decorator. 344 def cache_store(self): 345 from . import conf 346 347 path = conf.params["cache.path"] 348 touch_dir(path) 349 return caching.Sqlite3Store(os.path.join(path, "kegg_api_cache_2.sqlite3")) 350 351 def last_modified(self, args, kwargs=None): 352 return getattr(self, "default_release", "") 353 354 def set_default_release(self, release): 355 self.default_release = release 356 357 @cached_method 358 def list_organisms(self): 359 return KeggApi.list_organisms(self) 360 361 @cached_method 362 def list_pathways(self, organism): 363 return KeggApi.list_pathways(self, organism) 364 365 @cached_method 366 def list(self, db): 367 return KeggApi.list(self, db) 368 369 @lru_cache() # not persistently cached 370 def info(self, db): 371 return KeggApi.info(self, db) 372 373 @cached_method 374 def find(self, db, keywords): 375 return KeggApi.find(self, db, keywords) 376 377 @cached_method 378 def get(self, ids): 379 if not isinstance(ids, six.string_types): 380 return self._batch_get(ids) 381 else: 382 return KeggApi.get(self, ids) 383 384 @cached_method 385 def link(self, target_db, source_db=None, ids=None): 386 return KeggApi.link(self, target_db, source_db, ids) 387 388 def _batch_get(self, ids): 389 if len(ids) > 10: 390 raise ValueError("Can batch at most 10 ids at a time.") 391 392 get = self.get 393 uncached = [] 394 unmatched = set() 395 396 with closing(get.cache_store()) as store: 397 # Which ids are already cached 398 # TODO: Invalidate entries by release string. 399 for id in ids: 400 key = get.key_from_args((id,)) 401 if not get.key_has_valid_cache(key, store): 402 uncached.append(id) 403 404 if uncached: 405 # in case there are duplicate ids 406 uncached = sorted(set(uncached)) 407 408 rval = KeggApi.get(self, uncached) 409 410 if rval is not None: 411 entries = rval.split("///\n") 412 else: 413 entries = [] 414 415 if entries and not entries[-1].strip(): 416 # Delete the last single newline entry if present 417 del entries[-1] 418 419 if len(entries) != len(uncached): 420 new_uncached, entries = match_by_ids(uncached, entries) 421 unmatched = set(uncached) - set(new_uncached) 422 uncached = new_uncached 423 warnings.warn("Unable to match entries for keys: %s." % ", ".join(map(repr, unmatched))) 424 425 with closing(get.cache_store()) as store: 426 for id, entry in zip(uncached, entries): 427 key = get.key_from_args((id,)) 428 if entry is not None: 429 entry = entry + "///\n" 430 store[key] = cache_entry(entry, mtime=datetime.now()) 431 432 # Finally join all the results, but drop all None objects 433 434 with closing(get.cache_store()): 435 keys = [get.key_from_args((id,)) for id in ids] 436 entries = [store[key].value for key in keys] 437 438 entries = filter(lambda e: e is not None, entries) 439 440 rval = "".join(entries) 441 return rval 442 443 @cached_method 444 def conv(self, target_db, source): 445 return KeggApi.conv(self, target_db, source) 446 447 ######## 448 # LinkDB 449 ######## 450 451 @cached_method 452 def get_genes_by_enzyme(self, enzyme_id, org): 453 return KeggApi.get_genes_by_enzyme(self, enzyme_id, org) 454 455 @cached_method 456 def get_enzymes_by_gene(self, genes_id): 457 return KeggApi.get_enzymes_by_gene(self, genes_id) 458 459 @cached_method 460 def get_enzymes_by_compound(self, compound_id): 461 return KeggApi.get_enzymes_by_compound(self, compound_id) 462 463 @cached_method 464 def get_enzymes_by_glycan(self, glycan_id): 465 return KeggApi.get_enzymes_by_glycan(self, glycan_id) 466 467 @cached_method 468 def get_enzymes_by_reaction(self, reaction_id): 469 return KeggApi.get_enzymes_by_reaction(self, reaction_id) 470 471 @cached_method 472 def get_compounds_by_enzyme(self, enzyme_id): 473 return KeggApi.get_compounds_by_enzyme(self, enzyme_id) 474 475 @cached_method 476 def get_compounds_by_reaction(self, reaction_id): 477 return KeggApi.get_compounds_by_reaction(self, reaction_id) 478 479 @cached_method 480 def get_glycans_by_enzyme(self, enzyme_id): 481 return KeggApi.get_glycans_by_enzyme(self, enzyme_id) 482 483 @cached_method 484 def get_glycans_by_reaction(self, reaction_id): 485 return KeggApi.get_glycans_by_reaction(self, reaction_id) 486 487 @cached_method 488 def get_reactions_by_enzyme(self, enzyme_id): 489 return KeggApi.get_reactions_by_enzyme(self, enzyme_id) 490 491 @cached_method 492 def get_reactions_by_compound(self, compound_id): 493 return KeggApi.get_reactions_by_compound(self, compound_id) 494 495 @cached_method 496 def get_reactions_by_glycan(self, glycan_id): 497 return KeggApi.get_reactions_by_glycan(self, glycan_id) 498 499 ###### 500 # SSDB 501 ###### 502 503 @cached_method 504 def get_best_best_neighbors_by_gene(self, genes_id, offset, limit): 505 return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset, limit) 506 507 @cached_method 508 def get_best_neighbors_by_gene(self, genes_id, offset, limit): 509 return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset, limit) 510 511 @cached_method 512 def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit): 513 return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit) 514 515 @cached_method 516 def get_paralogs_by_gene(self, genes_id, offset, limit): 517 return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit) 518 519 ####### 520 # Motif 521 ####### 522 523 @cached_method 524 def get_motifs_by_gene(self, genes_id, db): 525 return KeggApi.get_motifs_by_gene(self, genes_id, db) 526 527 @cached_method 528 def get_genes_by_motifs(self, motif_id_list, offset, limit): 529 return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit) 530 531 #### 532 # KO 533 #### 534 535 @cached_method 536 def get_ko_by_gene(self, genes_id): 537 return KeggApi.get_ko_by_gene(self, genes_id) 538 539 @cached_method 540 def get_ko_by_ko_class(self, ko_class_id): 541 return KeggApi.service.get_ko_by_ko_class(self, ko_class_id) 542 543 @cached_method 544 def get_genes_by_ko_class(self, ko_class_id, org, offset, limit): 545 return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset, limit) 546 547 @cached_method 548 def get_genes_by_ko(self, ko_id, org): 549 return KeggApi.get_genes_by_ko(self, ko_id, org) 550 551 ######### 552 # Pathway 553 ######### 554 555 @cached_method 556 def get_genes_by_organism(self, organism, offset=None, limit=None): 557 return KeggApi.get_genes_by_organism(self, organism, offset=offset, limit=limit) 558 559 @cached_method 560 def get_number_of_genes_by_organism(self, organism): 561 return KeggApi.get_number_of_genes_by_organism(self, organism) 562 563 @cached_method 564 def get_pathways_by_genes(self, gene_list): 565 return KeggApi.get_pathways_by_genes(self, gene_list) 566 567 @cached_method 568 def get_pathways_by_enzymes(self, enzyme_list): 569 return KeggApi.get_pathways_by_enzymes(self, enzyme_list) 570 571 @cached_method 572 def get_pathways_by_compounds(self, compound_list): 573 return KeggApi.get_pathways_by_compounds(self, compound_list) 574 575 @cached_method 576 def get_pathways_by_drugs(self, drug_list): 577 return KeggApi.get_pathways_by_drugs(self, drug_list) 578 579 @cached_method 580 def get_pathways_by_glycans(self, glycan_list): 581 return KeggApi.get_pathways_by_glycans(self, glycan_list) 582 583 @cached_method 584 def get_pathways_by_reactions(self, reaction_list): 585 return KeggApi.get_pathways_by_reactions(self, reaction_list) 586 587 @cached_method 588 def get_pathways_by_kos(self, ko_list): 589 return KeggApi.get_pathways_by_kos(self, ko_list) 590 591 @cached_method 592 def get_elements_by_pathway(self, pathway_id): 593 return KeggApi.get_elements_by_pathway(self, pathway_id) 594 595 @cached_method 596 def get_genes_by_pathway(self, pathway_id): 597 return KeggApi.get_genes_by_pathway(self, pathway_id) 598 599 @cached_method 600 def get_enzymes_by_pathway(self, pathway_id): 601 return KeggApi.get_enzymes_by_pathway(self, pathway_id) 602 603 @cached_method 604 def get_compounds_by_pathway(self, pathway_id): 605 return KeggApi.get_compounds_by_pathway(self, pathway_id) 606 607 @cached_method 608 def get_drugs_by_pathway(self, pathway_id): 609 return KeggApi.get_drugs_by_pathway(self, pathway_id) 610 611 @cached_method 612 def get_glycans_by_pathway(self, pathway_id): 613 return KeggApi.get_glycans_by_pathway(self, pathway_id) 614 615 @cached_method 616 def get_reactions_by_pathway(self, pathway_id): 617 return KeggApi.get_reactions_by_pathway(self, pathway_id) 618 619 @cached_method 620 def get_kos_by_pathway(self, pathway_id): 621 return KeggApi.get_kos_by_pathway(self, pathway_id) 622 623 @cached_method 624 def get_genes_pathway_organism(self, org): 625 return KeggApi.get_genes_pathway_organism(self, org) 626 627 628def match_by_ids(ids, entries): 629 """ 630 631 """ 632 633 unmatched_ids = set(ids) 634 unmatched_entries = set(entries) 635 636 matched_ids = [] 637 matched_entries = [] 638 639 def match_add(search_id, entry): 640 """ 641 Move search_id and entry to the matched lists. 642 """ 643 matched_ids.append(search_id) 644 matched_entries.append(entry) 645 646 # Remove from the unmatched set 647 unmatched_ids.remove(search_id) 648 unmatched_entries.remove(entry) 649 650 def entry_split(entry_text): 651 line, _ = entry_text.split("\n", 1) 652 return line.split(None, 2) 653 654 entries_by_id = {} 655 656 for entry in entries: 657 _, eid, _ = entry_split(entry) 658 entries_by_id[eid] = entry 659 660 # First match full search ids 661 for search_id in list(unmatched_ids): 662 if search_id in entries_by_id: 663 entry = entries_by_id.pop(search_id) 664 match_add(search_id, entry) 665 666 # Second pass, split the search ids by ':' to db and identifier part, 667 # match by identifier 668 for search_id in list(unmatched_ids): 669 if ":" in search_id: 670 db_id, rest = search_id.split(":", 1) 671 if rest in entries_by_id: 672 entry = entries_by_id.pop(rest) 673 match_add(search_id, entry) 674 675 return matched_ids, matched_entries 676