1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3""" 4BIOM Table (:mod:`biom.table`) 5============================== 6 7The biom-format project provides rich ``Table`` objects to support use of the 8BIOM file format. The objects encapsulate matrix data (such as OTU counts) and 9abstract the interaction away from the programmer. 10 11.. currentmodule:: biom.table 12 13Classes 14------- 15 16.. autosummary:: 17 :toctree: generated/ 18 19 Table 20 21Examples 22-------- 23First, lets create a toy table to play around with. For this example, we're 24going to construct a 10x4 `Table`, or one that has 10 observations and 4 25samples. Each observation and sample will be given an arbitrary but unique 26name. We'll also add on some metadata. 27 28>>> import numpy as np 29>>> from biom.table import Table 30>>> data = np.arange(40).reshape(10, 4) 31>>> sample_ids = ['S%d' % i for i in range(4)] 32>>> observ_ids = ['O%d' % i for i in range(10)] 33>>> sample_metadata = [{'environment': 'A'}, {'environment': 'B'}, 34... {'environment': 'A'}, {'environment': 'B'}] 35>>> observ_metadata = [{'taxonomy': ['Bacteria', 'Firmicutes']}, 36... {'taxonomy': ['Bacteria', 'Firmicutes']}, 37... {'taxonomy': ['Bacteria', 'Proteobacteria']}, 38... {'taxonomy': ['Bacteria', 'Proteobacteria']}, 39... {'taxonomy': ['Bacteria', 'Proteobacteria']}, 40... {'taxonomy': ['Bacteria', 'Bacteroidetes']}, 41... {'taxonomy': ['Bacteria', 'Bacteroidetes']}, 42... {'taxonomy': ['Bacteria', 'Firmicutes']}, 43... {'taxonomy': ['Bacteria', 'Firmicutes']}, 44... {'taxonomy': ['Bacteria', 'Firmicutes']}] 45>>> table = Table(data, observ_ids, sample_ids, observ_metadata, 46... sample_metadata, table_id='Example Table') 47 48Now that we have a table, let's explore it at a high level first. 49 50>>> table 5110 x 4 <class 'biom.table.Table'> with 39 nonzero entries (97% dense) 52>>> print(table) # doctest: +NORMALIZE_WHITESPACE 53# Constructed from biom file 54#OTU ID S0 S1 S2 S3 55O0 0.0 1.0 2.0 3.0 56O1 4.0 5.0 6.0 7.0 57O2 8.0 9.0 10.0 11.0 58O3 12.0 13.0 14.0 15.0 59O4 16.0 17.0 18.0 19.0 60O5 20.0 21.0 22.0 23.0 61O6 24.0 25.0 26.0 27.0 62O7 28.0 29.0 30.0 31.0 63O8 32.0 33.0 34.0 35.0 64O9 36.0 37.0 38.0 39.0 65>>> print(table.ids()) # doctest: +NORMALIZE_WHITESPACE 66['S0' 'S1' 'S2' 'S3'] 67>>> print(table.ids(axis='observation')) # doctest: +NORMALIZE_WHITESPACE 68['O0' 'O1' 'O2' 'O3' 'O4' 'O5' 'O6' 'O7' 'O8' 'O9'] 69>>> print(table.nnz) # number of nonzero entries 7039 71 72While it's fun to just poke at the table, let's dig deeper. First, we're going 73to convert `table` into relative abundances (within each sample), and then 74filter `table` to just the samples associated with environment 'A'. The 75filtering gets fancy: we can pass in an arbitrary function to determine what 76samples we want to keep. This function must accept a sparse vector of values, 77the corresponding ID and the corresponding metadata, and should return ``True`` 78or ``False``, where ``True`` indicates that the vector should be retained. 79 80>>> normed = table.norm(axis='sample', inplace=False) 81>>> filter_f = lambda values, id_, md: md['environment'] == 'A' 82>>> env_a = normed.filter(filter_f, axis='sample', inplace=False) 83>>> print(env_a) # doctest: +NORMALIZE_WHITESPACE 84# Constructed from biom file 85#OTU ID S0 S2 86O0 0.0 0.01 87O1 0.0222222222222 0.03 88O2 0.0444444444444 0.05 89O3 0.0666666666667 0.07 90O4 0.0888888888889 0.09 91O5 0.111111111111 0.11 92O6 0.133333333333 0.13 93O7 0.155555555556 0.15 94O8 0.177777777778 0.17 95O9 0.2 0.19 96 97But, what if we wanted individual tables per environment? While we could just 98perform some fancy iteration, we can instead just rely on `Table.partition` for 99these operations. `partition`, like `filter`, accepts a function. However, the 100`partition` method only passes the corresponding ID and metadata to the 101function. The function should return what partition the data are a part of. 102Within this example, we're also going to sum up our tables over the partitioned 103samples. Please note that we're using the original table (ie, not normalized) 104here. 105 106>>> part_f = lambda id_, md: md['environment'] 107>>> env_tables = table.partition(part_f, axis='sample') 108>>> for partition, env_table in env_tables: 109... print(partition, env_table.sum('sample')) 110A [ 180. 200.] 111B [ 190. 210.] 112 113For this last example, and to highlight a bit more functionality, we're going 114to first transform the table such that all multiples of three will be retained, 115while all non-multiples of three will get set to zero. Following this, we'll 116then collpase the table by taxonomy, and then convert the table into 117presence/absence data. 118 119First, let's setup the transform. We're going to define a function that takes 120the modulus of every value in the vector, and see if it is equal to zero. If it 121is equal to zero, we'll keep the value, otherwise we'll set the value to zero. 122 123>>> transform_f = lambda v,i,m: np.where(v % 3 == 0, v, 0) 124>>> mult_of_three = tform = table.transform(transform_f, inplace=False) 125>>> print(mult_of_three) # doctest: +NORMALIZE_WHITESPACE 126# Constructed from biom file 127#OTU ID S0 S1 S2 S3 128O0 0.0 0.0 0.0 3.0 129O1 0.0 0.0 6.0 0.0 130O2 0.0 9.0 0.0 0.0 131O3 12.0 0.0 0.0 15.0 132O4 0.0 0.0 18.0 0.0 133O5 0.0 21.0 0.0 0.0 134O6 24.0 0.0 0.0 27.0 135O7 0.0 0.0 30.0 0.0 136O8 0.0 33.0 0.0 0.0 137O9 36.0 0.0 0.0 39.0 138 139Next, we're going to collapse the table over the phylum level taxon. To do 140this, we're going to define a helper variable for the index position of the 141phylum (see the construction of the table above). Next, we're going to pass 142this to `Table.collapse`, and since we want to collapse over the observations, 143we'll need to specify 'observation' as the axis. 144 145>>> phylum_idx = 1 146>>> collapse_f = lambda id_, md: '; '.join(md['taxonomy'][:phylum_idx + 1]) 147>>> collapsed = mult_of_three.collapse(collapse_f, axis='observation') 148>>> print(collapsed) # doctest: +NORMALIZE_WHITESPACE 149# Constructed from biom file 150#OTU ID S0 S1 S2 S3 151Bacteria; Firmicutes 7.2 6.6 7.2 8.4 152Bacteria; Proteobacteria 4.0 3.0 6.0 5.0 153Bacteria; Bacteroidetes 12.0 10.5 0.0 13.5 154 155Finally, let's convert the table to presence/absence data. 156 157>>> pa = collapsed.pa() 158>>> print(pa) # doctest: +NORMALIZE_WHITESPACE 159# Constructed from biom file 160#OTU ID S0 S1 S2 S3 161Bacteria; Firmicutes 1.0 1.0 1.0 1.0 162Bacteria; Proteobacteria 1.0 1.0 1.0 1.0 163Bacteria; Bacteroidetes 1.0 1.0 0.0 1.0 164 165""" 166 167# ----------------------------------------------------------------------------- 168# Copyright (c) 2011-2020, The BIOM Format Development Team. 169# 170# Distributed under the terms of the Modified BSD License. 171# 172# The full license is in the file COPYING.txt, distributed with this software. 173# ----------------------------------------------------------------------------- 174 175from __future__ import division 176import numpy as np 177import scipy.stats 178from copy import deepcopy 179from datetime import datetime 180from json import dumps 181from functools import reduce, partial 182from operator import itemgetter, or_ 183from future.builtins import zip 184from future.utils import viewitems 185from collections import defaultdict, Hashable, Iterable 186from numpy import ndarray, asarray, zeros, newaxis 187from scipy.sparse import (coo_matrix, csc_matrix, csr_matrix, isspmatrix, 188 vstack, hstack) 189import pandas as pd 190import re 191import six 192from future.utils import string_types as _future_string_types 193from biom.exception import (TableException, UnknownAxisError, UnknownIDError, 194 DisjointIDError) 195from biom.util import (get_biom_format_version_string, 196 get_biom_format_url_string, flatten, natsort, 197 prefer_self, index_list, H5PY_VLEN_STR, HAVE_H5PY, 198 __format_version__) 199from biom.err import errcheck 200from ._filter import _filter 201from ._transform import _transform 202from ._subsample import _subsample 203 204 205if not six.PY3: 206 string_types = list(_future_string_types) 207 string_types.append(str) 208 string_types.append(unicode) # noqa 209 string_types = tuple(string_types) 210else: 211 string_types = _future_string_types 212 213 214__author__ = "Daniel McDonald" 215__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team" 216__credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Greg Caporaso", 217 "Jose Clemente", "Justin Kuczynski", "Adam Robbins-Pianka", 218 "Joshua Shorenstein", "Jose Antonio Navas Molina", 219 "Jorge Cañardo Alastuey", "Steven Brown"] 220__license__ = "BSD" 221__url__ = "http://biom-format.org" 222__maintainer__ = "Daniel McDonald" 223__email__ = "daniel.mcdonald@colorado.edu" 224 225 226MATRIX_ELEMENT_TYPE = {'int': int, 'float': float, 'unicode': str, 227 u'int': int, u'float': float, u'unicode': str} 228 229 230def _identify_bad_value(dtype, fields): 231 """Identify the first value which cannot be cast 232 233 Paramters 234 --------- 235 dtype : type 236 A type to cast to 237 fields : Iterable of str 238 A series of str to cast into dtype 239 240 Returns 241 ------- 242 str or None 243 A value that cannot be cast 244 int or None 245 The index of the value that cannot be cast 246 """ 247 badval = None 248 badidx = None 249 for idx, v in enumerate(fields): 250 try: 251 dtype(v) 252 except: # noqa 253 badval = v 254 badidx = idx 255 break 256 return (badval, badidx) 257 258 259def general_parser(x): 260 if isinstance(x, bytes): 261 x = x.decode('utf8') 262 return x 263 264 265def vlen_list_of_str_parser(value): 266 """Parses the taxonomy value""" 267 new_value = [] 268 for v in value: 269 if v: 270 if isinstance(v, bytes): 271 v = v.decode('utf8') 272 new_value.append(v) 273 274 return new_value if new_value else None 275 276 277def general_formatter(grp, header, md, compression): 278 """Creates a dataset for a general atomic type category""" 279 shape = (len(md),) 280 name = 'metadata/%s' % header 281 dtypes = [type(m[header]) for m in md] 282 283 if set(dtypes).issubset(set(string_types)): 284 grp.create_dataset(name, shape=shape, 285 dtype=H5PY_VLEN_STR, 286 data=[m[header].encode('utf8') for m in md], 287 compression=compression) 288 elif set(dtypes).issubset({list, tuple}): 289 vlen_list_of_str_formatter(grp, header, md, compression) 290 else: 291 formatted = [] 292 dtypes_used = [] 293 for dt, m in zip(dtypes, md): 294 val = m[header] 295 if val is None: 296 val = '\0' 297 dt = str 298 299 if dt in string_types: 300 val = val.encode('utf8') 301 302 formatted.append(val) 303 dtypes_used.append(dt) 304 305 if set(dtypes_used).issubset(set(string_types)): 306 dtype_to_use = H5PY_VLEN_STR 307 else: 308 dtype_to_use = None 309 310 # try our best... 311 grp.create_dataset( 312 name, shape=(len(md),), 313 dtype=dtype_to_use, 314 data=formatted, 315 compression=compression) 316 317 318def vlen_list_of_str_formatter(grp, header, md, compression): 319 """Creates a (N, ?) vlen str dataset""" 320 # It is possible that the value for some sample/observation 321 # is None. In that case, we still need to see them as 322 # iterables, but their length will be 0 323 iterable_checks = [] 324 lengths = [] 325 for m in md: 326 if m[header] is None: 327 iterable_checks.append(True) 328 elif isinstance(m.get(header), str): 329 iterable_checks.append(False) 330 else: 331 iterable_checks.append( 332 isinstance(m.get(header, []), Iterable)) 333 lengths.append(len(m[header])) 334 335 if not np.all(iterable_checks): 336 if header == 'taxonomy': 337 # attempt to handle the general case issue where the taxonomy 338 # was not split on semicolons and represented as a flat string 339 # instead of a list 340 def split_and_strip(i): 341 parts = i.split(';') 342 return [p.strip() for p in parts] 343 try: 344 new_md = [] 345 lengths = [] 346 for m in md: 347 parts = split_and_strip(m[header]) 348 new_md.append({header: parts}) 349 lengths.append(len(parts)) 350 md = new_md 351 except: # noqa 352 raise TypeError("Category '%s' is not formatted properly. The " 353 "most common issue is when 'taxonomy' is " 354 "represented as a flat string instead of a " 355 "list. An attempt was made to split this " 356 "field on a ';' to coerce it into a list but " 357 "it failed. An example entry (which is not " 358 "assured to be the problematic entry) is " 359 "below:\n%s" % (header, md[0][header])) 360 else: 361 raise TypeError( 362 "Category %s not formatted correctly. Did you pass" 363 " --process-obs-metadata taxonomy when converting " 364 " from tsv? Please see Table.to_hdf5 docstring for" 365 " more information" % header) 366 367 max_list_len = max(lengths) 368 shape = (len(md), max_list_len) 369 data = np.empty(shape, dtype=object) 370 for i, m in enumerate(md): 371 if m[header] is None: 372 continue 373 value = np.asarray(m[header]) 374 data[i, :len(value)] = [v.encode('utf8') for v in value] 375 # Change the None entries on data to empty strings "" 376 data = np.where(data == np.array(None), "", data) 377 grp.create_dataset( 378 'metadata/%s' % header, shape=shape, 379 dtype=H5PY_VLEN_STR, data=data, 380 compression=compression) 381 382 383class Table(object): 384 385 """The (canonically pronounced 'teh') Table. 386 387 Give in to the power of the Table! 388 389 Creates an in-memory representation of a BIOM file. BIOM version 1.0 is 390 based on JSON to provide the overall structure for the format while 391 versions 2.0 and 2.1 are based on HDF5. For more information see [1]_ 392 and [2]_ 393 394 Paramaters 395 ---------- 396 data : array_like 397 An (N,M) sample by observation matrix represented as one of these 398 types: 399 An 1-dimensional array of values 400 An n-dimensional array of values 401 An empty list 402 A list of numpy arrays 403 A list of dict 404 A list of sparse matrices 405 A dictionary of values 406 A list of lists 407 A sparse matrix of values 408 observation_ids : array_like of str 409 A (N,) dataset of the observation IDs, where N is the total number 410 of IDs 411 sample_ids : array_like of str 412 A (M,) dataset of the sample IDs, where M is the total number of IDs 413 observation_metadata : list of dicts, optional 414 per observation dictionary of annotations where every key represents a 415 metadata field that contains specific metadata information, 416 ie taxonomy, KEGG pathway, etc 417 sample_metadata : array_like of dicts, optional 418 per sample dictionary of annotations where every key represents a 419 metadata field that contains sample specific metadata information, ie 420 table_id : str, optional 421 A field that can be used to identify the table 422 type : {None, "OTU table", "Pathway table", "Function table", 423 "Ortholog table", "Gene table", "Metabolite table", "Taxon table"} 424 The type of table represented 425 create_date : str, optional 426 Date that this table was built 427 generated_by : str, optional 428 Individual who built the table 429 observation_group_metadata : list, optional 430 group that contains observation specific group metadata information 431 (e.g., phylogenetic tree) 432 sample_group_metadata : list, optional 433 group that contains sample specific group metadata information 434 (e.g., relationships between samples) 435 436 Attributes 437 ---------- 438 shape 439 dtype 440 nnz 441 matrix_data 442 type 443 table_id 444 create_date 445 generated_by 446 format_version 447 448 Raises 449 ------ 450 TableException 451 When an invalid table type is provided. 452 453 References 454 ---------- 455 .. [1] http://biom-format.org/documentation/biom_format.html 456 .. [2] D. McDonald, et al. "The Biological Observation Matrix (BIOM) format 457 or: how I learned to stop worrying and love the ome-ome" 458 GigaScience 2012 1:7 459 """ 460 461 def __init__(self, data, observation_ids, sample_ids, 462 observation_metadata=None, sample_metadata=None, 463 table_id=None, type=None, create_date=None, generated_by=None, 464 observation_group_metadata=None, sample_group_metadata=None, 465 **kwargs): 466 467 self.type = type 468 self.table_id = table_id 469 self.create_date = create_date 470 self.generated_by = generated_by 471 self.format_version = __format_version__ 472 473 if not isspmatrix(data): 474 shape = (len(observation_ids), len(sample_ids)) 475 input_is_dense = kwargs.get('input_is_dense', False) 476 self._data = Table._to_sparse(data, input_is_dense=input_is_dense, 477 shape=shape) 478 else: 479 self._data = data.tocsr() 480 481 self._data = self._data.astype(float) 482 483 # using object to allow for variable length strings 484 self._sample_ids = np.asarray(sample_ids, dtype=object) 485 self._observation_ids = np.asarray(observation_ids, dtype=object) 486 487 if sample_metadata is not None: 488 # not m will evaluate True if the object tested is None or 489 # an empty dict, etc. 490 if {not m for m in sample_metadata} == {True, }: 491 self._sample_metadata = None 492 else: 493 self._sample_metadata = tuple(sample_metadata) 494 else: 495 self._sample_metadata = None 496 497 if observation_metadata is not None: 498 # not m will evaluate True if the object tested is None or 499 # an empty dict, etc. 500 if {not m for m in observation_metadata} == {True, }: 501 self._observation_metadata = None 502 else: 503 self._observation_metadata = tuple(observation_metadata) 504 else: 505 self._observation_metadata = None 506 507 self._sample_group_metadata = sample_group_metadata 508 self._observation_group_metadata = observation_group_metadata 509 510 errcheck(self) 511 512 # These will be set by _index_ids() 513 self._sample_index = None 514 self._obs_index = None 515 516 self._cast_metadata() 517 self._index_ids() 518 519 def _index_ids(self): 520 """Sets lookups {id:index in _data}. 521 522 Should only be called in constructor as this modifies state. 523 """ 524 self._sample_index = index_list(self._sample_ids) 525 self._obs_index = index_list(self._observation_ids) 526 527 def _index(self, axis='sample'): 528 """Return the index lookups of the given axis 529 530 Parameters 531 ---------- 532 axis : {'sample', 'observation'}, optional 533 Axis to get the index dict. Defaults to 'sample' 534 535 Returns 536 ------- 537 dict 538 lookups {id:index} 539 540 Raises 541 ------ 542 UnknownAxisError 543 If provided an unrecognized axis. 544 """ 545 if axis == 'sample': 546 return self._sample_index 547 elif axis == 'observation': 548 return self._obs_index 549 else: 550 raise UnknownAxisError(axis) 551 552 def _conv_to_self_type(self, vals, transpose=False, dtype=None): 553 """For converting vectors to a compatible self type""" 554 if dtype is None: 555 dtype = self.dtype 556 557 if isspmatrix(vals): 558 return vals 559 else: 560 return Table._to_sparse(vals, transpose, dtype) 561 562 @staticmethod 563 def _to_dense(vec): 564 """Converts a row/col vector to a dense numpy array. 565 566 Always returns a 1-D row vector for consistency with numpy iteration 567 over arrays. 568 """ 569 dense_vec = vec.toarray() 570 571 if vec.shape == (1, 1): 572 # Handle the special case where we only have a single element, but 573 # we don't want to return a numpy scalar / 0-d array. We still want 574 # to return a vector of length 1. 575 return dense_vec.reshape(1) 576 else: 577 return np.squeeze(dense_vec) 578 579 @staticmethod 580 def _to_sparse(values, transpose=False, dtype=float, input_is_dense=False, 581 shape=None): 582 """Try to return a populated scipy.sparse matrix. 583 584 NOTE: assumes the max value observed in row and col defines the size of 585 the matrix. 586 """ 587 # if it is a vector 588 if isinstance(values, ndarray) and len(values.shape) == 1: 589 if transpose: 590 mat = nparray_to_sparse(values[:, newaxis], dtype) 591 else: 592 mat = nparray_to_sparse(values, dtype) 593 return mat 594 if isinstance(values, ndarray): 595 if transpose: 596 mat = nparray_to_sparse(values.T, dtype) 597 else: 598 mat = nparray_to_sparse(values, dtype) 599 return mat 600 # the empty list 601 elif isinstance(values, list) and len(values) == 0: 602 return coo_matrix((0, 0)) 603 # list of np vectors 604 elif isinstance(values, list) and isinstance(values[0], ndarray): 605 mat = list_nparray_to_sparse(values, dtype) 606 if transpose: 607 mat = mat.T 608 return mat 609 # list of dicts, each representing a row in row order 610 elif isinstance(values, list) and isinstance(values[0], dict): 611 mat = list_dict_to_sparse(values, dtype) 612 if transpose: 613 mat = mat.T 614 return mat 615 # list of scipy.sparse matrices, each representing a row in row order 616 elif isinstance(values, list) and isspmatrix(values[0]): 617 mat = list_sparse_to_sparse(values, dtype) 618 if transpose: 619 mat = mat.T 620 return mat 621 elif isinstance(values, dict): 622 mat = dict_to_sparse(values, dtype, shape) 623 if transpose: 624 mat = mat.T 625 return mat 626 elif isinstance(values, list) and isinstance(values[0], list): 627 if input_is_dense: 628 d = coo_matrix(values) 629 mat = coo_arrays_to_sparse((d.data, (d.row, d.col)), 630 dtype=dtype, shape=shape) 631 else: 632 mat = list_list_to_sparse(values, dtype, shape=shape) 633 return mat 634 elif isspmatrix(values): 635 mat = values 636 if transpose: 637 mat = mat.transpose() 638 return mat 639 else: 640 raise TableException("Unknown input type") 641 642 def _cast_metadata(self): 643 """Casts all metadata to defaultdict to support default values. 644 645 Should be called after any modifications to sample/observation 646 metadata. 647 """ 648 def cast_metadata(md): 649 """Do the actual casting""" 650 default_md = [] 651 # if we have a list of [None], set to None 652 if md is not None: 653 if md.count(None) == len(md): 654 return None 655 if md is not None: 656 for item in md: 657 d = defaultdict(lambda: None) 658 659 if isinstance(item, dict): 660 d.update(item) 661 elif item is None: 662 pass 663 else: 664 raise TableException("Unable to cast metadata: %s" % 665 repr(item)) 666 default_md.append(d) 667 return tuple(default_md) 668 return md 669 670 self._sample_metadata = cast_metadata(self._sample_metadata) 671 self._observation_metadata = cast_metadata(self._observation_metadata) 672 673 self._sample_group_metadata = ( 674 self._sample_group_metadata 675 if self._sample_group_metadata else None) 676 self._observation_group_metadata = ( 677 self._observation_group_metadata 678 if self._observation_group_metadata else None) 679 680 @property 681 def shape(self): 682 """The shape of the underlying contingency matrix""" 683 return self._data.shape 684 685 @property 686 def dtype(self): 687 """The type of the objects in the underlying contingency matrix""" 688 return self._data.dtype 689 690 @property 691 def nnz(self): 692 """Number of non-zero elements of the underlying contingency matrix""" 693 self._data.eliminate_zeros() 694 return self._data.nnz 695 696 @property 697 def matrix_data(self): 698 """The sparse matrix object""" 699 return self._data 700 701 def length(self, axis='sample'): 702 """Return the length of an axis 703 704 Parameters 705 ---------- 706 axis : {'sample', 'observation'}, optional 707 The axis to operate on 708 709 Raises 710 ------ 711 UnknownAxisError 712 If provided an unrecognized axis. 713 714 Examples 715 -------- 716 >>> from biom import example_table 717 >>> print(example_table.length(axis='sample')) 718 3 719 >>> print(example_table.length(axis='observation')) 720 2 721 """ 722 if axis not in ('sample', 'observation'): 723 raise UnknownAxisError(axis) 724 725 return self.shape[1] if axis == 'sample' else self.shape[0] 726 727 def add_group_metadata(self, group_md, axis='sample'): 728 """Take a dict of group metadata and add it to an axis 729 730 Parameters 731 ---------- 732 group_md : dict of tuples 733 `group_md` should be of the form ``{category: (data type, value)`` 734 axis : {'sample', 'observation'}, optional 735 The axis to operate on 736 737 Raises 738 ------ 739 UnknownAxisError 740 If provided an unrecognized axis. 741 """ 742 if axis == 'sample': 743 if self._sample_group_metadata is not None: 744 self._sample_group_metadata.update(group_md) 745 else: 746 self._sample_group_metadata = group_md 747 elif axis == 'observation': 748 if self._observation_group_metadata is not None: 749 self._observation_group_metadata.update(group_md) 750 else: 751 self._observation_group_metadata = group_md 752 else: 753 raise UnknownAxisError(axis) 754 755 def del_metadata(self, keys=None, axis='whole'): 756 """Remove metadata from an axis 757 758 Parameters 759 ---------- 760 keys : list of str, optional 761 The keys to remove from metadata. If None, all keys from the axis 762 are removed. 763 axis : {'sample', 'observation', 'whole'}, optional 764 The axis to operate on. If 'whole', the operation is applied to 765 both the sample and observation axes. 766 767 Raises 768 ------ 769 UnknownAxisError 770 If the requested axis does not exist. 771 772 Examples 773 -------- 774 >>> from biom import Table 775 >>> import numpy as np 776 >>> tab = Table(np.array([[1, 2], [3, 4]]), 777 ... ['O1', 'O2'], 778 ... ['S1', 'S2'], 779 ... sample_metadata=[{'barcode': 'ATGC', 'env': 'A'}, 780 ... {'barcode': 'GGTT', 'env': 'B'}]) 781 >>> tab.del_metadata(keys=['env']) 782 >>> for id, md in zip(tab.ids(), tab.metadata()): 783 ... print(id, list(md.items())) 784 S1 [('barcode', 'ATGC')] 785 S2 [('barcode', 'GGTT')] 786 """ 787 if axis == 'whole': 788 axes = ['sample', 'observation'] 789 elif axis in ('sample', 'observation'): 790 axes = [axis] 791 else: 792 raise UnknownAxisError("%s is not recognized" % axis) 793 794 if keys is None: 795 if axis == 'whole': 796 self._sample_metadata = None 797 self._observation_metadata = None 798 elif axis == 'sample': 799 self._sample_metadata = None 800 else: 801 self._observation_metadata = None 802 return 803 804 for ax in axes: 805 if self.metadata(axis=ax) is None: 806 continue 807 808 for i, md in zip(self.ids(axis=ax), self.metadata(axis=ax)): 809 for k in keys: 810 if k in md: 811 del md[k] 812 813 # for consistency with init on absence of metadata 814 empties = {True if not md else False 815 for md in self.metadata(axis=ax)} 816 if empties == {True, }: 817 if ax == 'sample': 818 self._sample_metadata = None 819 else: 820 self._observation_metadata = None 821 822 def add_metadata(self, md, axis='sample'): 823 """Take a dict of metadata and add it to an axis. 824 825 Parameters 826 ---------- 827 md : dict of dict 828 `md` should be of the form ``{id: {dict_of_metadata}}`` 829 axis : {'sample', 'observation'}, optional 830 The axis to operate on 831 """ 832 metadata = self.metadata(axis=axis) 833 if metadata is not None: 834 for id_, md_entry in viewitems(md): 835 if self.exists(id_, axis=axis): 836 idx = self.index(id_, axis=axis) 837 metadata[idx].update(md_entry) 838 else: 839 ids = self.ids(axis=axis) 840 if axis == 'sample': 841 self._sample_metadata = tuple( 842 [md[id_] if id_ in md else None for id_ in ids]) 843 elif axis == 'observation': 844 self._observation_metadata = tuple( 845 [md[id_] if id_ in md else None for id_ in ids]) 846 else: 847 raise UnknownAxisError(axis) 848 self._cast_metadata() 849 850 def __getitem__(self, args): 851 """Handles row or column slices 852 853 Slicing over an individual axis is supported, but slicing over both 854 axes at the same time is not supported. Partial slices, such as 855 `foo[0, 5:10]` are not supported, however full slices are supported, 856 such as `foo[0, :]`. 857 858 Parameters 859 ---------- 860 args : tuple or slice 861 The specific element (by index position) to return or an entire 862 row or column of the data. 863 864 Returns 865 ------- 866 float or spmatrix 867 A float is return if a specific element is specified, otherwise a 868 spmatrix object representing a vector of sparse data is returned. 869 870 Raises 871 ------ 872 IndexError 873 - If the matrix is empty 874 - If the arguments do not appear to be a tuple 875 - If a slice on row and column is specified 876 - If a partial slice is specified 877 878 Notes 879 ----- 880 Switching between slicing rows and columns is inefficient. Slicing of 881 rows requires a CSR representation, while slicing of columns requires a 882 CSC representation, and transforms are performed on the data if the 883 data are not in the required representation. These transforms can be 884 expensive if done frequently. 885 886 .. shownumpydoc 887 """ 888 if self.is_empty(): 889 raise IndexError("Cannot retrieve an element from an empty/null " 890 "table.") 891 892 try: 893 row, col = args 894 except: # noqa 895 raise IndexError("Must specify (row, col).") 896 897 if isinstance(row, slice) and isinstance(col, slice): 898 raise IndexError("Can only slice a single axis.") 899 900 if isinstance(row, slice): 901 if row.start is None and row.stop is None: 902 return self._get_col(col) 903 else: 904 raise IndexError("Can only handle full : slices per axis.") 905 elif isinstance(col, slice): 906 if col.start is None and col.stop is None: 907 return self._get_row(row) 908 else: 909 raise IndexError("Can only handle full : slices per axis.") 910 else: 911 if self._data.getformat() == 'coo': 912 self._data = self._data.tocsr() 913 914 return self._data[row, col] 915 916 def _get_row(self, row_idx): 917 """Return the row at ``row_idx``. 918 919 A row vector will be returned as a scipy.sparse matrix in csr format. 920 921 Notes 922 ----- 923 Switching between slicing rows and columns is inefficient. Slicing of 924 rows requires a CSR representation, while slicing of columns requires a 925 CSC representation, and transforms are performed on the data if the 926 data are not in the required representation. These transforms can be 927 expensive if done frequently. 928 929 """ 930 self._data = self._data.tocsr() 931 return self._data.getrow(row_idx) 932 933 def _get_col(self, col_idx): 934 """Return the column at ``col_idx``. 935 936 A column vector will be returned as a scipy.sparse matrix in csc 937 format. 938 939 Notes 940 ----- 941 Switching between slicing rows and columns is inefficient. Slicing of 942 rows requires a CSR representation, while slicing of columns requires a 943 CSC representation, and transforms are performed on the data if the 944 data are not in the required representation. These transforms can be 945 expensive if done frequently. 946 947 """ 948 self._data = self._data.tocsc() 949 return self._data.getcol(col_idx) 950 951 def reduce(self, f, axis): 952 """Reduce over axis using function `f` 953 954 Parameters 955 ---------- 956 f : function 957 The function to use for the reduce operation 958 axis : {'sample', 'observation'} 959 The axis on which to operate 960 961 Returns 962 ------- 963 numpy.array 964 A one-dimensional array representing the reduced rows 965 (observations) or columns (samples) of the data matrix 966 967 Raises 968 ------ 969 UnknownAxisError 970 If `axis` is neither "sample" nor "observation" 971 TableException 972 If the table's data matrix is empty 973 974 Examples 975 -------- 976 >>> import numpy as np 977 >>> from biom.table import Table 978 979 Create a 2x3 table 980 981 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 982 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'], 983 ... [{'foo': 'bar'}, {'x': 'y'}], None) 984 985 Create a reduce function 986 987 >>> func = lambda x, y: x + y 988 989 Reduce table on samples 990 991 >>> table.reduce(func, 'sample') # doctest: +NORMALIZE_WHITESPACE 992 array([ 1., 3., 43.]) 993 994 Reduce table on observations 995 996 >>> table.reduce(func, 'observation') # doctest: +NORMALIZE_WHITESPACE 997 array([ 1., 46.]) 998 """ 999 if self.is_empty(): 1000 raise TableException("Cannot reduce an empty table") 1001 1002 # np.apply_along_axis might reduce type conversions here and improve 1003 # speed. am opting for reduce right now as I think its more readable 1004 return asarray([reduce(f, v) for v in self.iter_data(axis=axis)]) 1005 1006 def sum(self, axis='whole'): 1007 """Returns the sum by axis 1008 1009 Parameters 1010 ---------- 1011 axis : {'whole', 'sample', 'observation'}, optional 1012 The axis on which to operate. 1013 1014 Returns 1015 ------- 1016 numpy.array or float 1017 If `axis` is "whole", returns an float representing the whole 1018 table sum. If `axis` is either "sample" or "observation", returns a 1019 numpy.array that holds a sum for each sample or observation, 1020 respectively. 1021 1022 Examples 1023 -------- 1024 >>> import numpy as np 1025 >>> from biom.table import Table 1026 1027 Create a 2x3 BIOM table: 1028 1029 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1030 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3']) 1031 1032 Add all values in the table: 1033 1034 >>> table.sum() 1035 array(47.0) 1036 1037 Add all values per sample: 1038 1039 >>> table.sum(axis='sample') # doctest: +NORMALIZE_WHITESPACE 1040 array([ 1., 3., 43.]) 1041 1042 Add all values per observation: 1043 1044 >>> table.sum(axis='observation') # doctest: +NORMALIZE_WHITESPACE 1045 array([ 1., 46.]) 1046 """ 1047 if axis == 'whole': 1048 axis = None 1049 elif axis == 'sample': 1050 axis = 0 1051 elif axis == 'observation': 1052 axis = 1 1053 else: 1054 raise UnknownAxisError(axis) 1055 1056 matrix_sum = np.squeeze(np.asarray(self._data.sum(axis=axis))) 1057 1058 # We only want to return a scalar if the whole matrix was summed. 1059 if axis is not None and matrix_sum.shape == (): 1060 matrix_sum = matrix_sum.reshape(1) 1061 1062 return matrix_sum 1063 1064 def transpose(self): 1065 """Transpose the contingency table 1066 1067 The returned table will be an entirely new table, including copies of 1068 the (transposed) data, sample/observation IDs and metadata. 1069 1070 Returns 1071 ------- 1072 Table 1073 Return a new table that is the transpose of caller table. 1074 """ 1075 sample_md_copy = deepcopy(self.metadata()) 1076 obs_md_copy = deepcopy(self.metadata(axis='observation')) 1077 1078 if self._data.getformat() == 'lil': 1079 # lil's transpose method doesn't have the copy kwarg, but all of 1080 # the others do. 1081 self._data = self._data.tocsr() 1082 1083 # sample ids and observations are reversed becuase we trasposed 1084 return self.__class__(self._data.transpose(copy=True), 1085 self.ids()[:], self.ids(axis='observation')[:], 1086 sample_md_copy, obs_md_copy, self.table_id) 1087 1088 def head(self, n=5, m=5): 1089 """Get the first n rows and m columns from self 1090 1091 Parameters 1092 ---------- 1093 n : int, optional 1094 The number of rows (observations) to get. This number must be 1095 greater than 0. If not specified, 5 rows will be retrieved. 1096 1097 m : int, optional 1098 The number of columns (samples) to get. This number must be 1099 greater than 0. If not specified, 5 columns will be 1100 retrieved. 1101 1102 Notes 1103 ----- 1104 Like `head` for Linux like systems, requesting more rows (or columns) 1105 than exists will silently work. 1106 1107 Raises 1108 ------ 1109 IndexError 1110 If `n` or `m` are <= 0. 1111 1112 Returns 1113 ------- 1114 Table 1115 The subset table. 1116 1117 Examples 1118 -------- 1119 >>> import numpy as np 1120 >>> from biom.table import Table 1121 >>> data = np.arange(100).reshape(5, 20) 1122 >>> obs_ids = ['O%d' % i for i in range(1, 6)] 1123 >>> samp_ids = ['S%d' % i for i in range(1, 21)] 1124 >>> table = Table(data, obs_ids, samp_ids) 1125 >>> print(table.head()) # doctest: +NORMALIZE_WHITESPACE 1126 # Constructed from biom file 1127 #OTU ID S1 S2 S3 S4 S5 1128 O1 0.0 1.0 2.0 3.0 4.0 1129 O2 20.0 21.0 22.0 23.0 24.0 1130 O3 40.0 41.0 42.0 43.0 44.0 1131 O4 60.0 61.0 62.0 63.0 64.0 1132 O5 80.0 81.0 82.0 83.0 84.0 1133 1134 """ 1135 if n <= 0: 1136 raise IndexError("n cannot be <= 0.") 1137 1138 if m <= 0: 1139 raise IndexError("m cannot be <= 0.") 1140 1141 row_ids = self.ids(axis='observation')[:n] 1142 col_ids = self.ids(axis='sample')[:m] 1143 1144 table = self.filter(row_ids, axis='observation', inplace=False) 1145 return table.filter(col_ids, axis='sample') 1146 1147 def group_metadata(self, axis='sample'): 1148 """Return the group metadata of the given axis 1149 1150 Parameters 1151 ---------- 1152 axis : {'sample', 'observation'}, optional 1153 Axis to search for the group metadata. Defaults to 'sample' 1154 1155 Returns 1156 ------- 1157 dict 1158 The corresponding group metadata for the given axis 1159 1160 Raises 1161 ------ 1162 UnknownAxisError 1163 If provided an unrecognized axis. 1164 1165 Examples 1166 -------- 1167 >>> import numpy as np 1168 >>> from biom.table import Table 1169 1170 Create a 2x3 BIOM table, with group observation metadata and no group 1171 sample metadata: 1172 1173 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1174 >>> group_observation_md = {'tree': ('newick', '(O1:0.3,O2:0.4);')} 1175 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'], 1176 ... observation_group_metadata=group_observation_md) 1177 1178 Get the observation group metadata: 1179 1180 >>> table.group_metadata(axis='observation') 1181 {'tree': ('newick', '(O1:0.3,O2:0.4);')} 1182 1183 Get the sample group metadata: 1184 1185 >> table.group_metadata() 1186 None 1187 """ 1188 if axis == 'sample': 1189 return self._sample_group_metadata 1190 elif axis == 'observation': 1191 return self._observation_group_metadata 1192 else: 1193 raise UnknownAxisError(axis) 1194 1195 def ids(self, axis='sample'): 1196 """Return the ids along the given axis 1197 1198 Parameters 1199 ---------- 1200 axis : {'sample', 'observation'}, optional 1201 Axis to return ids from. Defaults to 'sample' 1202 1203 Returns 1204 ------- 1205 1-D numpy array 1206 The ids along the given axis 1207 1208 Raises 1209 ------ 1210 UnknownAxisError 1211 If provided an unrecognized axis. 1212 1213 Examples 1214 -------- 1215 >>> import numpy as np 1216 >>> from biom.table import Table 1217 1218 Create a 2x3 BIOM table: 1219 1220 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1221 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3']) 1222 1223 Get the ids along the observation axis: 1224 1225 >>> print(table.ids(axis='observation')) 1226 ['O1' 'O2'] 1227 1228 Get the ids along the sample axis: 1229 1230 >>> print(table.ids()) 1231 ['S1' 'S2' 'S3'] 1232 """ 1233 if axis == 'sample': 1234 return self._sample_ids 1235 elif axis == 'observation': 1236 return self._observation_ids 1237 else: 1238 raise UnknownAxisError(axis) 1239 1240 def update_ids(self, id_map, axis='sample', strict=True, inplace=True): 1241 """Update the ids along the given axis 1242 1243 Parameters 1244 ---------- 1245 id_map : dict 1246 Mapping of old to new ids 1247 axis : {'sample', 'observation'}, optional 1248 Axis to search for `id`. Defaults to 'sample' 1249 strict : bool, optional 1250 If ``True``, raise an error if an id is present in the given axis 1251 but is not a key in ``id_map``. If False, retain old identifier 1252 for ids that are present in the given axis but are not keys in 1253 ``id_map``. 1254 inplace : bool, optional 1255 If ``True`` the ids are updated in ``self``; if ``False`` the ids 1256 are updated in a new table is returned. 1257 1258 Returns 1259 ------- 1260 Table 1261 Table object where ids have been updated. 1262 1263 Raises 1264 ------ 1265 UnknownAxisError 1266 If provided an unrecognized axis. 1267 TableException 1268 If an id from ``self`` is not in ``id_map`` and ``strict`` is 1269 ``True``. 1270 1271 Examples 1272 -------- 1273 Create a 2x3 BIOM table: 1274 1275 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1276 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3']) 1277 1278 Define a mapping of old to new sample ids: 1279 1280 >>> id_map = {'S1':'s1.1', 'S2':'s2.2', 'S3':'s3.3'} 1281 1282 Get the ids along the sample axis in the table: 1283 1284 >>> print(table.ids(axis='sample')) 1285 ['S1' 'S2' 'S3'] 1286 1287 Update the sample ids and get the ids along the sample axis in the 1288 updated table: 1289 1290 >>> updated_table = table.update_ids(id_map, axis='sample') 1291 >>> print(updated_table.ids(axis='sample')) 1292 ['s1.1' 's2.2' 's3.3'] 1293 """ 1294 updated_ids = zeros(self.ids(axis=axis).size, dtype=object) 1295 for idx, old_id in enumerate(self.ids(axis=axis)): 1296 if strict and old_id not in id_map: 1297 raise TableException( 1298 "Mapping not provided for %s identifier: %s. If this " 1299 "identifier should not be updated, pass strict=False." 1300 % (axis, old_id)) 1301 1302 updated_ids[idx] = id_map.get(old_id, old_id) 1303 1304 # prepare the result object and update the ids along the specified 1305 # axis 1306 result = self if inplace else self.copy() 1307 if axis == 'sample': 1308 result._sample_ids = updated_ids 1309 else: 1310 result._observation_ids = updated_ids 1311 1312 result._index_ids() 1313 1314 # check for errors (specifically, we want to esnsure that duplicate 1315 # ids haven't been introduced) 1316 errcheck(result) 1317 1318 return result 1319 1320 def _get_sparse_data(self, axis='sample'): 1321 """Returns the internal data in the correct sparse representation 1322 1323 Parameters 1324 ---------- 1325 axis : {'sample', 'observation'}, optional 1326 Axis to search for `id`. Defaults to 'sample' 1327 1328 Returns 1329 ------- 1330 sparse matrix 1331 The data in csc (axis='sample') or csr (axis='observation') 1332 representation 1333 """ 1334 if axis == 'sample': 1335 return self._data.tocsc() 1336 elif axis == 'observation': 1337 return self._data.tocsr() 1338 else: 1339 raise UnknownAxisError(axis) 1340 1341 def metadata(self, id=None, axis='sample'): 1342 """Return the metadata of the identified sample/observation. 1343 1344 Parameters 1345 ---------- 1346 id : str 1347 ID of the sample or observation whose index will be returned. 1348 axis : {'sample', 'observation'} 1349 Axis to search for `id`. 1350 1351 Returns 1352 ------- 1353 defaultdict or None 1354 The corresponding metadata ``defaultdict`` or ``None`` of that axis 1355 does not have metadata. 1356 1357 Raises 1358 ------ 1359 UnknownAxisError 1360 If provided an unrecognized axis. 1361 UnknownIDError 1362 If provided an unrecognized sample/observation ID. 1363 1364 Examples 1365 -------- 1366 >>> import numpy as np 1367 >>> from biom.table import Table 1368 1369 Create a 2x3 BIOM table, with observation metadata and no sample 1370 metadata: 1371 1372 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1373 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'], 1374 ... [{'foo': 'bar'}, {'x': 'y'}], None) 1375 1376 Get the metadata of the observation with ID "O2": 1377 1378 >>> # casting to `dict` as the return is `defaultdict` 1379 >>> dict(table.metadata('O2', 'observation')) 1380 {'x': 'y'} 1381 1382 Get the metadata of the sample with ID "S1": 1383 1384 >>> table.metadata('S1', 'sample') is None 1385 True 1386 """ 1387 if axis == 'sample': 1388 md = self._sample_metadata 1389 elif axis == 'observation': 1390 md = self._observation_metadata 1391 else: 1392 raise UnknownAxisError(axis) 1393 1394 if id is None: 1395 return md 1396 1397 idx = self.index(id, axis=axis) 1398 1399 return md[idx] if md is not None else None 1400 1401 def index(self, id, axis): 1402 """Return the index of the identified sample/observation. 1403 1404 Parameters 1405 ---------- 1406 id : str 1407 ID of the sample or observation whose index will be returned. 1408 axis : {'sample', 'observation'} 1409 Axis to search for `id`. 1410 1411 Returns 1412 ------- 1413 int 1414 Index of the sample/observation identified by `id`. 1415 1416 Raises 1417 ------ 1418 UnknownAxisError 1419 If provided an unrecognized axis. 1420 UnknownIDError 1421 If provided an unrecognized sample/observation ID. 1422 1423 Examples 1424 -------- 1425 >>> import numpy as np 1426 >>> from biom.table import Table 1427 1428 Create a 2x3 BIOM table: 1429 1430 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1431 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3']) 1432 1433 Get the index of the observation with ID "O2": 1434 1435 >>> table.index('O2', 'observation') 1436 1 1437 1438 Get the index of the sample with ID "S1": 1439 1440 >>> table.index('S1', 'sample') 1441 0 1442 """ 1443 idx_lookup = self._index(axis=axis) 1444 1445 if id not in idx_lookup: 1446 raise UnknownIDError(id, axis) 1447 1448 return idx_lookup[id] 1449 1450 def get_value_by_ids(self, obs_id, samp_id): 1451 """Return value in the matrix corresponding to ``(obs_id, samp_id)`` 1452 1453 Parameters 1454 ---------- 1455 obs_id : str 1456 The ID of the observation 1457 samp_id : str 1458 The ID of the sample 1459 1460 Returns 1461 ------- 1462 float 1463 The data value corresponding to the specified matrix position 1464 1465 Examples 1466 -------- 1467 >>> import numpy as np 1468 >>> from biom.table import Table 1469 1470 Create a 2x3 BIOM table: 1471 1472 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1473 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'Z3']) 1474 1475 Retrieve the number of counts for observation `O1` in sample `Z3`. 1476 1477 >>> print(table.get_value_by_ids('O2', 'Z3')) 1478 42.0 1479 1480 See Also 1481 -------- 1482 Table.data 1483 """ 1484 return self[self.index(obs_id, 'observation'), 1485 self.index(samp_id, 'sample')] 1486 1487 def __str__(self): 1488 """Stringify self 1489 1490 Default str output for a Table is just row/col ids and data values 1491 """ 1492 return self.delimited_self() 1493 1494 def __repr__(self): 1495 """Returns a high-level summary of the table's properties 1496 1497 Returns 1498 ------- 1499 str 1500 A string detailing the shape, class, number of nonzero entries, and 1501 table density 1502 """ 1503 rows, cols = self.shape 1504 return '%d x %d %s with %d nonzero entries (%d%% dense)' % ( 1505 rows, cols, repr(self.__class__), self.nnz, 1506 self.get_table_density() * 100 1507 ) 1508 1509 def exists(self, id, axis="sample"): 1510 """Returns whether id exists in axis 1511 1512 Parameters 1513 ---------- 1514 id: str 1515 id to check if exists 1516 axis : {'sample', 'observation'}, optional 1517 The axis to check 1518 1519 Returns 1520 ------- 1521 bool 1522 ``True`` if `id` exists, ``False`` otherwise 1523 1524 Examples 1525 -------- 1526 >>> import numpy as np 1527 >>> from biom.table import Table 1528 1529 Create a 2x3 BIOM table: 1530 1531 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1532 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3']) 1533 1534 Check whether sample ID is in the table: 1535 1536 >>> table.exists('S1') 1537 True 1538 >>> table.exists('S4') 1539 False 1540 1541 Check whether an observation ID is in the table: 1542 1543 >>> table.exists('O1', 'observation') 1544 True 1545 >>> table.exists('O3', 'observation') 1546 False 1547 """ 1548 return id in self._index(axis=axis) 1549 1550 def delimited_self(self, delim=u'\t', header_key=None, header_value=None, 1551 metadata_formatter=str, 1552 observation_column_name=u'#OTU ID', direct_io=None): 1553 """Return self as a string in a delimited form 1554 1555 Default str output for the Table is just row/col ids and table data 1556 without any metadata 1557 1558 Including observation metadata in output: If ``header_key`` is not 1559 ``None``, the observation metadata with that name will be included 1560 in the delimited output. If ``header_value`` is also not ``None``, the 1561 observation metadata will use the provided ``header_value`` as the 1562 observation metadata name (i.e., the column header) in the delimited 1563 output. 1564 1565 ``metadata_formatter``: a function which takes a metadata entry and 1566 returns a formatted version that should be written to file 1567 1568 ``observation_column_name``: the name of the first column in the output 1569 table, corresponding to the observation IDs. For example, the default 1570 will look something like: 1571 1572 #OTU ID\tSample1\tSample2 1573 OTU1\t10\t2 1574 OTU2\t4\t8 1575 """ 1576 def to_utf8(i): 1577 if isinstance(i, bytes): 1578 return i.decode('utf8') 1579 else: 1580 return str(i) 1581 1582 if self.is_empty(): 1583 raise TableException("Cannot delimit self if I don't have data...") 1584 1585 samp_ids = delim.join([to_utf8(i) for i in self.ids()]) 1586 # 17 hrs of straight programming later... 1587 if header_key is not None: 1588 if header_value is None: 1589 raise TableException( 1590 "You need to specify both header_key and header_value") 1591 1592 if header_value is not None: 1593 if header_key is None: 1594 raise TableException( 1595 "You need to specify both header_key and header_value") 1596 1597 if header_value: 1598 output = [u'# Constructed from biom file', 1599 u'%s%s%s\t%s' % (observation_column_name, delim, 1600 samp_ids, header_value)] 1601 else: 1602 output = ['# Constructed from biom file', 1603 '%s%s%s' % (observation_column_name, delim, samp_ids)] 1604 1605 if direct_io is not None: 1606 direct_io.writelines([i+"\n" for i in output]) 1607 1608 obs_metadata = self.metadata(axis='observation') 1609 iterable = self.ids(axis='observation') 1610 end_line = '' if direct_io is None else '\n' 1611 1612 for obs_id, obs_values in zip(iterable, 1613 self._iter_obs()): 1614 str_obs_vals = delim.join(map(str, self._to_dense(obs_values))) 1615 obs_id = to_utf8(obs_id) 1616 if header_key and obs_metadata is not None: 1617 md = obs_metadata[self._obs_index[obs_id]] 1618 md_out = metadata_formatter(md.get(header_key, None)) 1619 output_row = u'%s%s%s\t%s%s' % \ 1620 (obs_id, delim, str_obs_vals, md_out, end_line) 1621 1622 if direct_io is None: 1623 output.append(output_row) 1624 else: 1625 direct_io.write(output_row) 1626 else: 1627 output_row = u'%s%s%s%s' % \ 1628 (obs_id, delim, str_obs_vals, end_line) 1629 if direct_io is None: 1630 output.append(output_row) 1631 else: 1632 direct_io.write((output_row)) 1633 1634 return '\n'.join(output) 1635 1636 def is_empty(self): 1637 """Check whether the table is empty 1638 1639 Returns 1640 ------- 1641 bool 1642 ``True`` if the table is empty, ``False`` otherwise 1643 """ 1644 if not self.ids().size or not self.ids(axis='observation').size: 1645 return True 1646 else: 1647 return False 1648 1649 def __iter__(self): 1650 """See ``biom.table.Table.iter``""" 1651 return self.iter() 1652 1653 def _iter_samp(self): 1654 """Return sample vectors of data matrix vectors""" 1655 for c in range(self.shape[1]): 1656 # this pulls out col vectors but need to convert to the expected 1657 # row vector 1658 colvec = self._get_col(c) 1659 yield colvec.transpose(copy=True) 1660 1661 def _iter_obs(self): 1662 """Return observation vectors of data matrix""" 1663 for r in range(self.shape[0]): 1664 yield self._get_row(r) 1665 1666 def get_table_density(self): 1667 """Returns the fraction of nonzero elements in the table. 1668 1669 Returns 1670 ------- 1671 float 1672 The fraction of nonzero elements in the table 1673 """ 1674 density = 0.0 1675 1676 if not self.is_empty(): 1677 density = (self.nnz / 1678 (len(self.ids()) * len(self.ids(axis='observation')))) 1679 1680 return density 1681 1682 def descriptive_equality(self, other): 1683 """For use in testing, describe how the tables are not equal""" 1684 if not isinstance(other, self.__class__): 1685 return "Tables are not of comparable classes" 1686 if not self.type == other.type: 1687 return "Tables are not the same type" 1688 if not np.array_equal(self.ids(axis='observation'), 1689 other.ids(axis='observation')): 1690 return "Observation IDs are not the same" 1691 if not np.array_equal(self.ids(), other.ids()): 1692 return "Sample IDs are not the same" 1693 if not np.array_equal(self.metadata(axis='observation'), 1694 other.metadata(axis='observation')): 1695 return "Observation metadata are not the same" 1696 if not np.array_equal(self.metadata(), other.metadata()): 1697 return "Sample metadata are not the same" 1698 if not self._data_equality(other._data): 1699 return "Data elements are not the same" 1700 1701 return "Tables appear equal" 1702 1703 def __eq__(self, other): 1704 """Equality is determined by the data matrix, metadata, and IDs""" 1705 if not isinstance(other, self.__class__): 1706 return False 1707 if self.type != other.type: 1708 return False 1709 if not np.array_equal(self.ids(axis='observation'), 1710 other.ids(axis='observation')): 1711 return False 1712 if not np.array_equal(self.ids(), other.ids()): 1713 return False 1714 if not np.array_equal(self.metadata(axis='observation'), 1715 other.metadata(axis='observation')): 1716 return False 1717 if not np.array_equal(self.metadata(), other.metadata()): 1718 return False 1719 if not self._data_equality(other._data): 1720 return False 1721 1722 return True 1723 1724 def _data_equality(self, other): 1725 """Return ``True`` if both matrices are equal. 1726 1727 Matrices are equal iff the following items are equal: 1728 - shape 1729 - dtype 1730 - size (nnz) 1731 - matrix data (more expensive, so checked last) 1732 1733 The sparse format does not need to be the same between the two 1734 matrices. ``self`` and ``other`` will be converted to csr format if 1735 necessary before performing the final comparison. 1736 1737 """ 1738 if self._data.shape != other.shape: 1739 return False 1740 1741 if self._data.dtype != other.dtype: 1742 return False 1743 1744 if self._data.nnz != other.nnz: 1745 return False 1746 1747 self._data = self._data.tocsr() 1748 other = other.tocsr() 1749 1750 if (self._data != other).nnz > 0: 1751 return False 1752 1753 return True 1754 1755 def __ne__(self, other): 1756 return not (self == other) 1757 1758 def data(self, id, axis='sample', dense=True): 1759 """Returns data associated with an `id` 1760 1761 Parameters 1762 ---------- 1763 id : str 1764 ID of the sample or observation whose data will be returned. 1765 axis : {'sample', 'observation'} 1766 Axis to search for `id`. 1767 dense : bool, optional 1768 If ``True``, return data as dense 1769 1770 Returns 1771 ------- 1772 np.ndarray or scipy.sparse.spmatrix 1773 np.ndarray if ``dense``, otherwise scipy.sparse.spmatrix 1774 1775 Raises 1776 ------ 1777 UnknownAxisError 1778 If provided an unrecognized axis. 1779 1780 Examples 1781 -------- 1782 >>> from biom import example_table 1783 >>> example_table.data('S1', axis='sample') 1784 array([ 0., 3.]) 1785 1786 See Also 1787 -------- 1788 Table.get_value_by_ids 1789 1790 """ 1791 if axis == 'sample': 1792 data = self[:, self.index(id, 'sample')] 1793 elif axis == 'observation': 1794 data = self[self.index(id, 'observation'), :] 1795 else: 1796 raise UnknownAxisError(axis) 1797 1798 if dense: 1799 return self._to_dense(data) 1800 else: 1801 return data 1802 1803 def copy(self): 1804 """Returns a copy of the table""" 1805 return self.__class__(self._data.copy(), 1806 self.ids(axis='observation').copy(), 1807 self.ids().copy(), 1808 deepcopy(self.metadata(axis='observation')), 1809 deepcopy(self.metadata()), 1810 self.table_id, 1811 type=self.type) 1812 1813 def iter_data(self, dense=True, axis='sample'): 1814 """Yields axis values 1815 1816 Parameters 1817 ---------- 1818 dense : bool, optional 1819 Defaults to ``True``. If ``False``, yield compressed sparse row or 1820 compressed sparse columns if `axis` is 'observation' or 'sample', 1821 respectively. 1822 axis : {'sample', 'observation'}, optional 1823 Axis to iterate over. 1824 1825 Returns 1826 ------- 1827 generator 1828 Yields list of values for each value in `axis` 1829 1830 Raises 1831 ------ 1832 UnknownAxisError 1833 If axis other than 'sample' or 'observation' passed 1834 1835 Examples 1836 -------- 1837 >>> import numpy as np 1838 >>> from biom.table import Table 1839 >>> data = np.arange(30).reshape(3,10) # 3 X 10 OTU X Sample table 1840 >>> obs_ids = ['o1', 'o2', 'o3'] 1841 >>> sam_ids = ['s%i' %i for i in range(1,11)] 1842 >>> bt = Table(data, observation_ids=obs_ids, sample_ids=sam_ids) 1843 1844 Lets find the sample with the largest sum 1845 1846 >>> sample_gen = bt.iter_data(axis='sample') 1847 >>> max_sample_count = max([sample.sum() for sample in sample_gen]) 1848 >>> print(max_sample_count) 1849 57.0 1850 """ 1851 if axis == "sample": 1852 for samp_v in self._iter_samp(): 1853 if dense: 1854 yield self._to_dense(samp_v) 1855 else: 1856 yield samp_v 1857 elif axis == "observation": 1858 for obs_v in self._iter_obs(): 1859 if dense: 1860 yield self._to_dense(obs_v) 1861 else: 1862 yield obs_v 1863 else: 1864 raise UnknownAxisError(axis) 1865 1866 def iter(self, dense=True, axis='sample'): 1867 """Yields ``(value, id, metadata)`` 1868 1869 1870 Parameters 1871 ---------- 1872 dense : bool, optional 1873 Defaults to ``True``. If ``False``, yield compressed sparse row or 1874 compressed sparse columns if `axis` is 'observation' or 'sample', 1875 respectively. 1876 axis : {'sample', 'observation'}, optional 1877 The axis to iterate over. 1878 1879 Returns 1880 ------- 1881 GeneratorType 1882 A generator that yields (values, id, metadata) 1883 1884 Examples 1885 -------- 1886 >>> import numpy as np 1887 >>> from biom.table import Table 1888 1889 Create a 2x3 BIOM table: 1890 1891 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 1892 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'Z3']) 1893 1894 Iter over samples and keep those that start with an Z: 1895 1896 >>> [(values, id, metadata) 1897 ... for values, id, metadata in table.iter() if id[0]=='Z'] 1898 [(array([ 1., 42.]), 'Z3', None)] 1899 1900 Iter over observations and add the 2nd column of the values 1901 1902 >>> col = [values[1] for values, id, metadata in table.iter()] 1903 >>> sum(col) 1904 46.0 1905 """ 1906 ids = self.ids(axis=axis) 1907 metadata = self.metadata(axis=axis) 1908 if axis == 'sample': 1909 iter_ = self._iter_samp() 1910 elif axis == 'observation': 1911 iter_ = self._iter_obs() 1912 else: 1913 raise UnknownAxisError(axis) 1914 1915 if metadata is None: 1916 metadata = (None,) * len(ids) 1917 1918 iter_ = self.iter_data(axis=axis, dense=dense) 1919 1920 return zip(iter_, ids, metadata) 1921 1922 def iter_pairwise(self, dense=True, axis='sample', tri=True, diag=False): 1923 """Pairwise iteration over self 1924 1925 Parameters 1926 ---------- 1927 dense : bool, optional 1928 Defaults to ``True``. If ``False``, yield compressed sparse row or 1929 compressed sparse columns if `axis` is 'observation' or 'sample', 1930 respectively. 1931 axis : {'sample', 'observation'}, optional 1932 The axis to iterate over. 1933 tri : bool, optional 1934 If ``True``, just yield [i, j] and not [j, i] 1935 diag : bool, optional 1936 If ``True``, yield [i, i] 1937 1938 Returns 1939 ------- 1940 GeneratorType 1941 Yields [(val_i, id_i, metadata_i), (val_j, id_j, metadata_j)] 1942 1943 Raises 1944 ------ 1945 UnknownAxisError 1946 1947 Examples 1948 -------- 1949 >>> from biom import example_table 1950 1951 By default, only the upper triangle without the diagonal of the 1952 resulting pairwise combinations is yielded. 1953 1954 >>> iter_ = example_table.iter_pairwise() 1955 >>> for (val_i, id_i, md_i), (val_j, id_j, md_j) in iter_: 1956 ... print(id_i, id_j) 1957 S1 S2 1958 S1 S3 1959 S2 S3 1960 1961 The full pairwise combinations can also be yielded though. 1962 1963 >>> iter_ = example_table.iter_pairwise(tri=False, diag=True) 1964 >>> for (val_i, id_i, md_i), (val_j, id_j, md_j) in iter_: 1965 ... print(id_i, id_j) 1966 S1 S1 1967 S1 S2 1968 S1 S3 1969 S2 S1 1970 S2 S2 1971 S2 S3 1972 S3 S1 1973 S3 S2 1974 S3 S3 1975 1976 """ 1977 metadata = self.metadata(axis=axis) 1978 ids = self.ids(axis=axis) 1979 1980 if metadata is None: 1981 metadata = (None,) * len(ids) 1982 1983 ind = np.arange(len(ids)) 1984 diag_v = 1 - diag # for offseting tri_f, where a 0 includes the diag 1985 1986 if tri: 1987 def tri_f(idx): 1988 return ind[idx+diag_v:] 1989 else: 1990 def tri_f(idx): 1991 return np.hstack([ind[:idx], ind[idx+diag_v:]]) 1992 1993 for idx, i in enumerate(ind): 1994 id_i = ids[i] 1995 md_i = metadata[i] 1996 data_i = self.data(id_i, axis=axis, dense=dense) 1997 1998 for j in tri_f(idx): 1999 id_j = ids[j] 2000 md_j = metadata[j] 2001 data_j = self.data(id_j, axis=axis, dense=dense) 2002 2003 yield ((data_i, id_i, md_i), (data_j, id_j, md_j)) 2004 2005 def sort_order(self, order, axis='sample'): 2006 """Return a new table with `axis` in `order` 2007 2008 Parameters 2009 ---------- 2010 order : iterable 2011 The desired order for axis 2012 axis : {'sample', 'observation'}, optional 2013 The axis to operate on 2014 2015 Returns 2016 ------- 2017 Table 2018 A table where the observations or samples are sorted according to 2019 `order` 2020 2021 Examples 2022 -------- 2023 2024 >>> import numpy as np 2025 >>> from biom.table import Table 2026 2027 Create a 2x3 BIOM table: 2028 2029 >>> data = np.asarray([[1, 0, 4], [1, 3, 0]]) 2030 >>> table = Table(data, ['O2', 'O1'], ['S2', 'S1', 'S3']) 2031 >>> print(table) # doctest: +NORMALIZE_WHITESPACE 2032 # Constructed from biom file 2033 #OTU ID S2 S1 S3 2034 O2 1.0 0.0 4.0 2035 O1 1.0 3.0 0.0 2036 2037 Sort the table using a list of samples: 2038 2039 >>> sorted_table = table.sort_order(['S2', 'S3', 'S1']) 2040 >>> print(sorted_table) # doctest: +NORMALIZE_WHITESPACE 2041 # Constructed from biom file 2042 #OTU ID S2 S3 S1 2043 O2 1.0 4.0 0.0 2044 O1 1.0 0.0 3.0 2045 2046 2047 Additionally you could sort the table's observations: 2048 2049 >>> sorted_table = table.sort_order(['O1', 'O2'], axis="observation") 2050 >>> print(sorted_table) # doctest: +NORMALIZE_WHITESPACE 2051 # Constructed from biom file 2052 #OTU ID S2 S1 S3 2053 O1 1.0 3.0 0.0 2054 O2 1.0 0.0 4.0 2055 2056 """ 2057 fancy = np.array([self.index(i, axis=axis) for i in order], dtype=int) 2058 metadata = self.metadata(axis=axis) 2059 if metadata is not None: 2060 metadata = np.array(metadata)[fancy] 2061 2062 if axis == 'sample': 2063 mat = self.matrix_data[:, fancy] 2064 return self.__class__(mat, 2065 self.ids(axis='observation')[:], order[:], 2066 self.metadata(axis='observation'), metadata, 2067 self.table_id, self.type) 2068 2069 elif axis == 'observation': 2070 mat = self.matrix_data[fancy, :] 2071 return self.__class__(mat, 2072 order[:], self.ids()[:], 2073 metadata, self.metadata(), self.table_id, 2074 self.type) 2075 else: 2076 raise UnknownAxisError(axis) 2077 2078 def sort(self, sort_f=natsort, axis='sample'): 2079 """Return a table sorted along axis 2080 2081 Parameters 2082 ---------- 2083 sort_f : function, optional 2084 Defaults to ``biom.util.natsort``. A function that takes a list of 2085 values and sorts it 2086 axis : {'sample', 'observation'}, optional 2087 The axis to operate on 2088 2089 Returns 2090 ------- 2091 biom.Table 2092 A table whose samples or observations are sorted according to the 2093 `sort_f` function 2094 2095 Examples 2096 -------- 2097 >>> import numpy as np 2098 >>> from biom.table import Table 2099 2100 Create a 2x3 BIOM table: 2101 2102 >>> data = np.asarray([[1, 0, 4], [1, 3, 0]]) 2103 >>> table = Table(data, ['O2', 'O1'], ['S2', 'S1', 'S3']) 2104 >>> print(table) # doctest: +NORMALIZE_WHITESPACE 2105 # Constructed from biom file 2106 #OTU ID S2 S1 S3 2107 O2 1.0 0.0 4.0 2108 O1 1.0 3.0 0.0 2109 2110 Sort the order of samples in the table using the default natural 2111 sorting: 2112 2113 >>> new_table = table.sort() 2114 >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE 2115 # Constructed from biom file 2116 #OTU ID S1 S2 S3 2117 O2 0.0 1.0 4.0 2118 O1 3.0 1.0 0.0 2119 2120 Sort the order of observations in the table using the default natural 2121 sorting: 2122 2123 >>> new_table = table.sort(axis='observation') 2124 >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE 2125 # Constructed from biom file 2126 #OTU ID S2 S1 S3 2127 O1 1.0 3.0 0.0 2128 O2 1.0 0.0 4.0 2129 2130 Sort the samples in reverse order using a custom sort function: 2131 2132 >>> sort_f = lambda x: list(sorted(x, reverse=True)) 2133 >>> new_table = table.sort(sort_f=sort_f) 2134 >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE 2135 # Constructed from biom file 2136 #OTU ID S3 S2 S1 2137 O2 4.0 1.0 0.0 2138 O1 0.0 1.0 3.0 2139 """ 2140 return self.sort_order(sort_f(self.ids(axis=axis)), axis=axis) 2141 2142 def filter(self, ids_to_keep, axis='sample', invert=False, inplace=True): 2143 """Filter a table based on a function or iterable. 2144 2145 Parameters 2146 ---------- 2147 ids_to_keep : iterable, or function(values, id, metadata) -> bool 2148 If a function, it will be called with the values of the 2149 sample/observation, its id (a string) and the dictionary 2150 of metadata of each sample/observation, and must return a 2151 boolean. If it's an iterable, it must be a list of ids to 2152 keep. 2153 axis : {'sample', 'observation'}, optional 2154 It controls whether to filter samples or observations and 2155 defaults to "sample". 2156 invert : bool, optional 2157 Defaults to ``False``. If set to ``True``, discard samples or 2158 observations where `ids_to_keep` returns True 2159 inplace : bool, optional 2160 Defaults to ``True``. Whether to return a new table or modify 2161 itself. 2162 2163 Returns 2164 ------- 2165 biom.Table 2166 Returns itself if `inplace`, else returns a new filtered table. 2167 2168 Raises 2169 ------ 2170 UnknownAxisError 2171 If provided an unrecognized axis. 2172 2173 Examples 2174 -------- 2175 >>> import numpy as np 2176 >>> from biom.table import Table 2177 2178 Create a 2x3 BIOM table, with observation metadata and sample 2179 metadata: 2180 2181 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 2182 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'], 2183 ... [{'full_genome_available': True}, 2184 ... {'full_genome_available': False}], 2185 ... [{'sample_type': 'a'}, {'sample_type': 'a'}, 2186 ... {'sample_type': 'b'}]) 2187 2188 Define a function to keep only samples with sample_type == 'a'. This 2189 will drop sample S3, which has sample_type 'b': 2190 2191 >>> filter_fn = lambda val, id_, md: md['sample_type'] == 'a' 2192 2193 Get a filtered version of the table, leaving the original table 2194 untouched: 2195 2196 >>> new_table = table.filter(filter_fn, inplace=False) 2197 >>> print(table.ids()) 2198 ['S1' 'S2' 'S3'] 2199 >>> print(new_table.ids()) 2200 ['S1' 'S2'] 2201 2202 Using the same filtering function, discard all samples with sample_type 2203 'a'. This will keep only sample S3, which has sample_type 'b': 2204 2205 >>> new_table = table.filter(filter_fn, inplace=False, invert=True) 2206 >>> print(table.ids()) 2207 ['S1' 'S2' 'S3'] 2208 >>> print(new_table.ids()) 2209 ['S3'] 2210 2211 Filter the table in-place using the same function (drop all samples 2212 where sample_type is not 'a'): 2213 2214 >>> table.filter(filter_fn) 2215 2 x 2 <class 'biom.table.Table'> with 2 nonzero entries (50% dense) 2216 >>> print(table.ids()) 2217 ['S1' 'S2'] 2218 2219 Filter out all observations in the table that do not have 2220 full_genome_available == True. This will filter out observation O2: 2221 2222 >>> filter_fn = lambda val, id_, md: md['full_genome_available'] 2223 >>> table.filter(filter_fn, axis='observation') 2224 1 x 2 <class 'biom.table.Table'> with 0 nonzero entries (0% dense) 2225 >>> print(table.ids(axis='observation')) 2226 ['O1'] 2227 2228 """ 2229 table = self if inplace else self.copy() 2230 2231 metadata = table.metadata(axis=axis) 2232 ids = table.ids(axis=axis) 2233 index = self._index(axis=axis) 2234 axis = table._axis_to_num(axis=axis) 2235 2236 arr = table._data 2237 arr, ids, metadata = _filter(arr, 2238 ids, 2239 metadata, 2240 index, 2241 ids_to_keep, 2242 axis, 2243 invert=invert) 2244 2245 table._data = arr 2246 if axis == 1: 2247 table._sample_ids = ids 2248 table._sample_metadata = metadata 2249 elif axis == 0: 2250 table._observation_ids = ids 2251 table._observation_metadata = metadata 2252 2253 table._index_ids() 2254 errcheck(table) 2255 2256 return table 2257 2258 def partition(self, f, axis='sample'): 2259 """Yields partitions 2260 2261 Parameters 2262 ---------- 2263 f : function 2264 `f` is given the ID and metadata of the vector and must return 2265 what partition the vector is part of. 2266 axis : {'sample', 'observation'}, optional 2267 The axis to iterate over 2268 2269 Returns 2270 ------- 2271 GeneratorType 2272 A generator that yields (partition, `Table`) 2273 2274 Examples 2275 -------- 2276 >>> import numpy as np 2277 >>> from biom.table import Table 2278 >>> from biom.util import unzip 2279 2280 Create a 2x3 BIOM table, with observation metadata and sample 2281 metadata: 2282 2283 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 2284 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'], 2285 ... [{'full_genome_available': True}, 2286 ... {'full_genome_available': False}], 2287 ... [{'sample_type': 'a'}, {'sample_type': 'a'}, 2288 ... {'sample_type': 'b'}]) 2289 2290 Define a function to bin by sample_type 2291 2292 >>> f = lambda id_, md: md['sample_type'] 2293 2294 Partition the table and view results 2295 2296 >>> bins, tables = table.partition(f) 2297 >>> print(bins[1]) # doctest: +NORMALIZE_WHITESPACE 2298 # Constructed from biom file 2299 #OTU ID S1 S2 2300 O1 0.0 0.0 2301 O2 1.0 3.0 2302 >>> print(tables[1]) # doctest: +NORMALIZE_WHITESPACE 2303 # Constructed from biom file 2304 #OTU ID S3 2305 O1 1.0 2306 O2 42.0 2307 """ 2308 partitions = {} 2309 # conversion of vector types is not necessary, vectors are not 2310 # being passed to an arbitrary function 2311 for vals, id_, md in self.iter(dense=False, axis=axis): 2312 part = f(id_, md) 2313 2314 # try to make it hashable... 2315 if not isinstance(part, Hashable): 2316 part = tuple(part) 2317 2318 if part not in partitions: 2319 partitions[part] = [[], [], []] 2320 2321 partitions[part][0].append(id_) 2322 partitions[part][1].append(vals) 2323 partitions[part][2].append(md) 2324 2325 md = self.metadata(axis=self._invert_axis(axis)) 2326 2327 for part, (ids, values, metadata) in viewitems(partitions): 2328 if axis == 'sample': 2329 data = self._conv_to_self_type(values, transpose=True) 2330 samp_ids = ids 2331 samp_md = metadata 2332 obs_ids = self.ids(axis='observation')[:] 2333 obs_md = md[:] if md is not None else None 2334 2335 elif axis == 'observation': 2336 data = self._conv_to_self_type(values, transpose=False) 2337 obs_ids = ids 2338 obs_md = metadata 2339 samp_ids = self.ids()[:] 2340 samp_md = md[:] if md is not None else None 2341 2342 yield part, Table(data, obs_ids, samp_ids, obs_md, samp_md, 2343 self.table_id, type=self.type) 2344 2345 def collapse(self, f, collapse_f=None, norm=True, min_group_size=1, 2346 include_collapsed_metadata=True, one_to_many=False, 2347 one_to_many_mode='add', one_to_many_md_key='Path', 2348 strict=False, axis='sample'): 2349 """Collapse partitions in a table by metadata or by IDs 2350 2351 Partition data by metadata or IDs and then collapse each partition into 2352 a single vector. 2353 2354 If `include_collapsed_metadata` is ``True``, the metadata for the 2355 collapsed partition will be a category named 'collapsed_ids', in which 2356 a list of the original ids that made up the partition is retained 2357 2358 The remainder is only relevant to setting `one_to_many` to ``True``. 2359 2360 If `one_to_many` is ``True``, allow vectors to collapse into multiple 2361 bins if the metadata describe a one-many relationship. Supplied 2362 functions must allow for iteration support over the metadata key and 2363 must return a tuple of (path, bin) as to describe both the path in the 2364 hierarchy represented and the specific bin being collapsed into. The 2365 uniqueness of the bin is _not_ based on the path but by the name of the 2366 bin. 2367 2368 The metadata value for the corresponding collapsed column may include 2369 more (or less) information about the collapsed data. For example, if 2370 collapsing "FOO", and there are vectors that span three associations A, 2371 B, and C, such that vector 1 spans A and B, vector 2 spans B and C and 2372 vector 3 spans A and C, the resulting table will contain three 2373 collapsed vectors: 2374 2375 - A, containing original vectors 1 and 3 2376 - B, containing original vectors 1 and 2 2377 - C, containing original vectors 2 and 3 2378 2379 If a vector maps to the same partition multiple times, it will be 2380 counted multiple times. 2381 2382 There are two supported modes for handling one-to-many relationships 2383 via `one_to_many_mode`: ``add`` and `divide`. ``add`` will add the 2384 vector counts to each partition that the vector maps to, which may 2385 increase the total number of counts in the output table. ``divide`` 2386 will divide a vectors's counts by the number of metadata that the 2387 vector has before adding the counts to each partition. This will not 2388 increase the total number of counts in the output table. 2389 2390 If `one_to_many_md_key` is specified, that becomes the metadata 2391 key that describes the collapsed path. If a value is not specified, 2392 then it defaults to 'Path'. 2393 2394 If `strict` is specified, then all metadata pathways operated on 2395 must be indexable by `metadata_f`. 2396 2397 `one_to_many` and `norm` are not supported together. 2398 2399 `one_to_many` and `collapse_f` are not supported together. 2400 2401 `one_to_many` and `min_group_size` are not supported together. 2402 2403 A final note on space consumption. At present, the `one_to_many` 2404 functionality requires a temporary dense matrix representation. 2405 2406 Parameters 2407 ---------- 2408 f : function 2409 Function that is used to determine what partition a vector belongs 2410 to 2411 collapse_f : function, optional 2412 Function that collapses a partition in a one-to-one collapse. The 2413 expected function signature is: 2414 2415 dense or sparse_vector <- collapse_f(Table, axis) 2416 2417 Defaults to a pairwise add. 2418 2419 norm : bool, optional 2420 Defaults to ``True``. If ``True``, normalize the resulting table 2421 min_group_size : int, optional 2422 Defaults to ``1``. The minimum size of a partition when performing 2423 a one-to-one collapse 2424 include_collapsed_metadata : bool, optional 2425 Defaults to ``True``. If ``True``, retain the collapsed metadata 2426 keyed by the original IDs of the associated vectors 2427 one_to_many : bool, optional 2428 Defaults to ``False``. Perform a one-to-many collapse 2429 one_to_many_mode : {'add', 'divide'}, optional 2430 The way to reduce two vectors in a one-to-many collapse 2431 one_to_many_md_key : str, optional 2432 Defaults to "Path". If `include_collapsed_metadata` is ``True``, 2433 store the original vector metadata under this key 2434 strict : bool, optional 2435 Defaults to ``False``. Requires full pathway data within a 2436 one-to-many structure 2437 axis : {'sample', 'observation'}, optional 2438 The axis to collapse 2439 2440 Returns 2441 ------- 2442 Table 2443 The collapsed table 2444 2445 Examples 2446 -------- 2447 >>> import numpy as np 2448 >>> from biom.table import Table 2449 2450 Create a ``Table`` 2451 2452 >>> dt_rich = Table( 2453 ... np.array([[5, 6, 7], [8, 9, 10], [11, 12, 13]]), 2454 ... ['1', '2', '3'], ['a', 'b', 'c'], 2455 ... [{'taxonomy': ['k__a', 'p__b']}, 2456 ... {'taxonomy': ['k__a', 'p__c']}, 2457 ... {'taxonomy': ['k__a', 'p__c']}], 2458 ... [{'barcode': 'aatt'}, 2459 ... {'barcode': 'ttgg'}, 2460 ... {'barcode': 'aatt'}]) 2461 >>> print(dt_rich) # doctest: +NORMALIZE_WHITESPACE 2462 # Constructed from biom file 2463 #OTU ID a b c 2464 1 5.0 6.0 7.0 2465 2 8.0 9.0 10.0 2466 3 11.0 12.0 13.0 2467 2468 Create Function to determine what partition a vector belongs to 2469 2470 >>> bin_f = lambda id_, x: x['taxonomy'][1] 2471 >>> obs_phy = dt_rich.collapse( 2472 ... bin_f, norm=False, min_group_size=1, 2473 ... axis='observation').sort(axis='observation') 2474 >>> print(obs_phy) # doctest: +NORMALIZE_WHITESPACE 2475 # Constructed from biom file 2476 #OTU ID a b c 2477 p__b 5.0 6.0 7.0 2478 p__c 19.0 21.0 23.0 2479 """ 2480 collapsed_data = [] 2481 collapsed_ids = [] 2482 2483 if include_collapsed_metadata: 2484 collapsed_md = [] 2485 else: 2486 collapsed_md = None 2487 2488 if one_to_many_mode not in ['add', 'divide']: 2489 raise ValueError("Unrecognized one-to-many mode '%s'. Must be " 2490 "either 'add' or 'divide'." % one_to_many_mode) 2491 2492 # transpose is only necessary in the one-to-one case 2493 # new_data_shape is only necessary in the one-to-many case 2494 # axis_slice is only necessary in the one-to-many case 2495 def axis_ids_md(t): 2496 return (t.ids(axis=axis), t.metadata(axis=axis)) 2497 2498 if axis == 'sample': 2499 transpose = True 2500 2501 def new_data_shape(ids, collapsed): 2502 return (len(ids), len(collapsed)) 2503 2504 def axis_slice(lookup, key): 2505 return (slice(None), lookup[key]) 2506 2507 elif axis == 'observation': 2508 transpose = False 2509 2510 def new_data_shape(ids, collapsed): 2511 return (len(collapsed), len(ids)) 2512 2513 def axis_slice(lookup, key): 2514 return (lookup[key], slice(None)) 2515 2516 else: 2517 raise UnknownAxisError(axis) 2518 2519 if one_to_many: 2520 if norm: 2521 raise AttributeError( 2522 "norm and one_to_many are not supported together") 2523 2524 # determine the collapsed pathway 2525 # we drop all other associated metadata 2526 new_md = {} 2527 md_count = {} 2528 2529 for id_, md in zip(*axis_ids_md(self)): 2530 md_iter = f(id_, md) 2531 num_md = 0 2532 while True: 2533 try: 2534 pathway, partition = next(md_iter) 2535 except IndexError: 2536 # if a pathway is incomplete 2537 if strict: 2538 # bail if strict 2539 err = "Incomplete pathway, ID: %s, metadata: %s" %\ 2540 (id_, md) 2541 raise IndexError(err) 2542 else: 2543 # otherwise ignore 2544 continue 2545 except StopIteration: 2546 break 2547 2548 new_md[partition] = pathway 2549 num_md += 1 2550 2551 md_count[id_] = num_md 2552 2553 idx_lookup = {part: i for i, part in enumerate(sorted(new_md))} 2554 2555 # We need to store floats, not ints, as things won't always divide 2556 # evenly. 2557 dtype = float if one_to_many_mode == 'divide' else self.dtype 2558 2559 new_data = zeros(new_data_shape(self.ids(self._invert_axis(axis)), 2560 new_md), 2561 dtype=dtype) 2562 2563 # for each vector 2564 # for each bin in the metadata 2565 # for each partition associated with the vector 2566 for vals, id_, md in self.iter(axis=axis): 2567 md_iter = f(id_, md) 2568 2569 while True: 2570 try: 2571 pathway, part = next(md_iter) 2572 except IndexError: 2573 # if a pathway is incomplete 2574 if strict: 2575 # bail if strict, should never get here... 2576 err = "Incomplete pathway, ID: %s, metadata: %s" %\ 2577 (id_, md) 2578 raise IndexError(err) 2579 else: 2580 # otherwise ignore 2581 continue 2582 except StopIteration: 2583 break 2584 if one_to_many_mode == 'add': 2585 new_data[axis_slice(idx_lookup, part)] += vals 2586 else: 2587 new_data[axis_slice(idx_lookup, part)] += \ 2588 vals / md_count[id_] 2589 2590 if include_collapsed_metadata: 2591 # reassociate pathway information 2592 for k, i in sorted(viewitems(idx_lookup), key=itemgetter(1)): 2593 collapsed_md.append({one_to_many_md_key: new_md[k]}) 2594 2595 # get the new sample IDs 2596 collapsed_ids = [k for k, i in sorted(viewitems(idx_lookup), 2597 key=itemgetter(1))] 2598 2599 # convert back to self type 2600 data = self._conv_to_self_type(new_data) 2601 else: 2602 if collapse_f is None: 2603 def collapse_f(t, axis): 2604 return t.sum(axis) 2605 2606 for part, table in self.partition(f, axis=axis): 2607 axis_ids, axis_md = axis_ids_md(table) 2608 2609 if len(axis_ids) < min_group_size: 2610 continue 2611 2612 redux_data = collapse_f(table, self._invert_axis(axis)) 2613 if norm: 2614 redux_data /= len(axis_ids) 2615 2616 collapsed_data.append(self._conv_to_self_type(redux_data)) 2617 collapsed_ids.append(part) 2618 2619 if include_collapsed_metadata: 2620 # retain metadata but store by original id 2621 collapsed_md.append({'collapsed_ids': axis_ids.tolist()}) 2622 2623 data = self._conv_to_self_type(collapsed_data, transpose=transpose) 2624 2625 # if the table is empty 2626 errcheck(self, 'empty') 2627 2628 md = self.metadata(axis=self._invert_axis(axis)) 2629 if axis == 'sample': 2630 sample_ids = collapsed_ids 2631 sample_md = collapsed_md 2632 obs_ids = self.ids(axis='observation')[:] 2633 obs_md = md if md is not None else None 2634 else: 2635 sample_ids = self.ids()[:] 2636 obs_ids = collapsed_ids 2637 obs_md = collapsed_md 2638 sample_md = md if md is not None else None 2639 2640 return Table(data, obs_ids, sample_ids, obs_md, sample_md, 2641 self.table_id, type=self.type) 2642 2643 def _invert_axis(self, axis): 2644 """Invert an axis""" 2645 if axis == 'sample': 2646 return 'observation' 2647 elif axis == 'observation': 2648 return 'sample' 2649 else: 2650 return UnknownAxisError(axis) 2651 2652 def _axis_to_num(self, axis): 2653 """Convert str axis to numerical axis""" 2654 if axis == 'sample': 2655 return 1 2656 elif axis == 'observation': 2657 return 0 2658 else: 2659 raise UnknownAxisError(axis) 2660 2661 def min(self, axis='sample'): 2662 """Get the minimum nonzero value over an axis 2663 2664 Parameters 2665 ---------- 2666 axis : {'sample', 'observation', 'whole'}, optional 2667 Defaults to "sample". The axis over which to calculate minima. 2668 2669 Returns 2670 ------- 2671 scalar of self.dtype or np.array of self.dtype 2672 2673 Raises 2674 ------ 2675 UnknownAxisError 2676 If provided an unrecognized axis. 2677 2678 Examples 2679 -------- 2680 >>> from biom import example_table 2681 >>> print(example_table.min(axis='sample')) 2682 [ 3. 1. 2.] 2683 2684 """ 2685 if axis not in ['sample', 'observation', 'whole']: 2686 raise UnknownAxisError(axis) 2687 2688 if axis == 'whole': 2689 min_val = np.inf 2690 for data in self.iter_data(dense=False): 2691 # only min over the actual nonzero values 2692 min_val = min(min_val, data.data.min()) 2693 else: 2694 min_val = zeros(len(self.ids(axis=axis)), dtype=self.dtype) 2695 2696 for idx, data in enumerate(self.iter_data(dense=False, axis=axis)): 2697 min_val[idx] = data.data.min() 2698 2699 return min_val 2700 2701 def max(self, axis='sample'): 2702 """Get the maximum nonzero value over an axis 2703 2704 Parameters 2705 ---------- 2706 axis : {'sample', 'observation', 'whole'}, optional 2707 Defaults to "sample". The axis over which to calculate maxima. 2708 2709 Returns 2710 ------- 2711 scalar of self.dtype or np.array of self.dtype 2712 2713 Raises 2714 ------ 2715 UnknownAxisError 2716 If provided an unrecognized axis. 2717 2718 Examples 2719 -------- 2720 >>> from biom import example_table 2721 >>> print(example_table.max(axis='observation')) 2722 [ 2. 5.] 2723 2724 """ 2725 if axis not in ['sample', 'observation', 'whole']: 2726 raise UnknownAxisError(axis) 2727 2728 if axis == 'whole': 2729 max_val = -np.inf 2730 for data in self.iter_data(dense=False): 2731 # only min over the actual nonzero values 2732 max_val = max(max_val, data.data.max()) 2733 else: 2734 max_val = np.empty(len(self.ids(axis=axis)), dtype=self.dtype) 2735 2736 for idx, data in enumerate(self.iter_data(dense=False, axis=axis)): 2737 max_val[idx] = data.data.max() 2738 2739 return max_val 2740 2741 def subsample(self, n, axis='sample', by_id=False, with_replacement=False): 2742 """Randomly subsample without replacement. 2743 2744 Parameters 2745 ---------- 2746 n : int 2747 Number of items to subsample from `counts`. 2748 axis : {'sample', 'observation'}, optional 2749 The axis to sample over 2750 by_id : boolean, optional 2751 If `False`, the subsampling is based on the counts contained in the 2752 matrix (e.g., rarefaction). If `True`, the subsampling is based on 2753 the IDs (e.g., fetch a random subset of samples). Default is 2754 `False`. 2755 with_replacement : boolean, optional 2756 If `False` (default), subsample without replacement. If `True`, 2757 resample with replacement via the multinomial distribution. 2758 Should not be `True` if `by_id` is `True`. 2759 2760 Returns 2761 ------- 2762 biom.Table 2763 A subsampled version of self 2764 2765 Raises 2766 ------ 2767 ValueError 2768 - If `n` is less than zero. 2769 - If `by_id` and `with_replacement` are both True. 2770 2771 Notes 2772 ----- 2773 Subsampling is performed without replacement. If `n` is greater than 2774 the sum of a given vector, that vector is omitted from the result. 2775 2776 Adapted from `skbio.math.subsample`, see biom-format/licenses for more 2777 information about scikit-bio. 2778 2779 This code assumes absolute abundance if `by_id` is False. 2780 2781 Examples 2782 -------- 2783 >>> import numpy as np 2784 >>> from biom.table import Table 2785 >>> table = Table(np.array([[0, 2, 3], [1, 0, 2]]), ['O1', 'O2'], 2786 ... ['S1', 'S2', 'S3']) 2787 2788 Subsample 1 item over the sample axis by value (e.g., rarefaction): 2789 2790 >>> print(table.subsample(1).sum(axis='sample')) 2791 [ 1. 1. 1.] 2792 2793 Subsample 2 items over the sample axis, note that 'S1' is filtered out: 2794 2795 >>> ss = table.subsample(2) 2796 >>> print(ss.sum(axis='sample')) 2797 [ 2. 2.] 2798 >>> print(ss.ids()) 2799 ['S2' 'S3'] 2800 2801 Subsample by IDs over the sample axis. For this example, we're going to 2802 randomly select 2 samples and do this 100 times, and then print out the 2803 set of IDs observed. 2804 2805 >>> ids = set([tuple(table.subsample(2, by_id=True).ids()) 2806 ... for i in range(100)]) 2807 >>> print(sorted(ids)) 2808 [('S1', 'S2'), ('S1', 'S3'), ('S2', 'S3')] 2809 2810 """ 2811 if n < 0: 2812 raise ValueError("n cannot be negative.") 2813 2814 if with_replacement and by_id: 2815 raise ValueError("by_id and with_replacement cannot both be True") 2816 2817 table = self.copy() 2818 2819 if by_id: 2820 ids = table.ids(axis=axis).copy() 2821 np.random.shuffle(ids) 2822 subset = set(ids[:n]) 2823 table.filter(lambda v, i, md: i in subset, axis=axis) 2824 else: 2825 data = table._get_sparse_data() 2826 _subsample(data, n, with_replacement) 2827 table._data = data 2828 2829 table.filter(lambda v, i, md: v.sum() > 0, axis=axis) 2830 2831 inv_axis = self._invert_axis(axis) 2832 table.filter(lambda v, i, md: v.sum() > 0, axis=inv_axis) 2833 2834 return table 2835 2836 def pa(self, inplace=True): 2837 """Convert the table to presence/absence data 2838 2839 Parameters 2840 ---------- 2841 inplace : bool, optional 2842 Defaults to ``True`` 2843 2844 Returns 2845 ------- 2846 Table 2847 Returns itself if `inplace`, else returns a new presence/absence 2848 table. 2849 2850 Examples 2851 -------- 2852 >>> from biom.table import Table 2853 >>> import numpy as np 2854 2855 Create a 2x3 BIOM table 2856 2857 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 2858 >>> table = table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3']) 2859 2860 Convert to presence/absence data 2861 2862 >>> _ = table.pa() 2863 >>> print(table.data('O1', 'observation')) 2864 [ 0. 0. 1.] 2865 >>> print(table.data('O2', 'observation')) 2866 [ 1. 1. 1.] 2867 """ 2868 def transform_f(data, id_, metadata): 2869 return np.where(data != 0, 1., 0.) 2870 2871 return self.transform(transform_f, inplace=inplace) 2872 2873 def transform(self, f, axis='sample', inplace=True): 2874 """Iterate over `axis`, applying a function `f` to each vector. 2875 2876 Only non null values can be modified and the density of the 2877 table can't increase. However, zeroing values is fine. 2878 2879 Parameters 2880 ---------- 2881 f : function(data, id, metadata) -> new data 2882 A function that takes three values: an array of nonzero 2883 values corresponding to each observation or sample, an 2884 observation or sample id, and an observation or sample 2885 metadata entry. It must return an array of transformed 2886 values that replace the original values. 2887 axis : {'sample', 'observation'}, optional 2888 The axis to operate on. Can be "sample" or "observation". 2889 inplace : bool, optional 2890 Defaults to ``True``. Whether to return a new table or modify 2891 itself. 2892 2893 Returns 2894 ------- 2895 biom.Table 2896 Returns itself if `inplace`, else returns a new transformed table. 2897 2898 Raises 2899 ------ 2900 UnknownAxisError 2901 If provided an unrecognized axis. 2902 2903 Examples 2904 -------- 2905 >>> import numpy as np 2906 >>> from biom.table import Table 2907 2908 Create a 2x3 table 2909 2910 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 2911 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'], 2912 ... [{'foo': 'bar'}, {'x': 'y'}], None) 2913 >>> print(table) # doctest: +NORMALIZE_WHITESPACE 2914 # Constructed from biom file 2915 #OTU ID S1 S2 S3 2916 O1 0.0 0.0 1.0 2917 O2 1.0 3.0 42.0 2918 2919 Create a transform function 2920 2921 >>> f = lambda data, id_, md: data / 2 2922 2923 Transform to a new table on samples 2924 2925 >>> table2 = table.transform(f, 'sample', False) 2926 >>> print(table2) # doctest: +NORMALIZE_WHITESPACE 2927 # Constructed from biom file 2928 #OTU ID S1 S2 S3 2929 O1 0.0 0.0 0.5 2930 O2 0.5 1.5 21.0 2931 2932 `table` hasn't changed 2933 2934 >>> print(table) # doctest: +NORMALIZE_WHITESPACE 2935 # Constructed from biom file 2936 #OTU ID S1 S2 S3 2937 O1 0.0 0.0 1.0 2938 O2 1.0 3.0 42.0 2939 2940 Tranform in place on observations 2941 2942 >>> table3 = table.transform(f, 'observation', True) 2943 2944 `table` is different now 2945 2946 >>> print(table) # doctest: +NORMALIZE_WHITESPACE 2947 # Constructed from biom file 2948 #OTU ID S1 S2 S3 2949 O1 0.0 0.0 0.5 2950 O2 0.5 1.5 21.0 2951 2952 but the table returned (`table3`) is the same as `table` 2953 2954 >>> print(table3) # doctest: +NORMALIZE_WHITESPACE 2955 # Constructed from biom file 2956 #OTU ID S1 S2 S3 2957 O1 0.0 0.0 0.5 2958 O2 0.5 1.5 21.0 2959 2960 """ 2961 table = self if inplace else self.copy() 2962 2963 metadata = table.metadata(axis=axis) 2964 ids = table.ids(axis=axis) 2965 arr = table._get_sparse_data(axis=axis) 2966 2967 axis = table._axis_to_num(axis) 2968 2969 _transform(arr, ids, metadata, f, axis) 2970 arr.eliminate_zeros() 2971 2972 table._data = arr 2973 2974 return table 2975 2976 def rankdata(self, axis='sample', inplace=True, method='average'): 2977 """Convert values to rank abundances from smallest to largest 2978 2979 Parameters 2980 ---------- 2981 axis : {'sample', 'observation'}, optional 2982 The axis to use for ranking. 2983 inplace : bool, optional 2984 Defaults to ``True``. If ``True``, performs the ranking in 2985 place. Otherwise, returns a new table with ranking applied. 2986 method : str, optional 2987 The method for handling ties in counts. This can be any valid 2988 string that can be passed to `scipy.stats.rankdata`. 2989 2990 Returns 2991 ------- 2992 biom.Table 2993 The rank-abundance-transformed table. 2994 2995 Raises 2996 ------ 2997 ValueError 2998 If unknown ``method`` is provided. 2999 3000 See Also 3001 -------- 3002 scipy.stats.rankdata 3003 3004 Examples 3005 -------- 3006 >>> import numpy as np 3007 >>> from biom import Table 3008 >>> data = np.array([[ 99, 12, 8], [ 0, 42, 7], 3009 ... [112, 42, 6], [ 5, 75, 5]]) 3010 >>> t = Table(data, sample_ids=['s1', 's2', 's3'], 3011 ... observation_ids=['o1', 'o2', 'o3', 'o4']) 3012 3013 Convert observation counts to their ranked abundance, from smallest 3014 to largest. 3015 3016 >>> print(t.rankdata()) # doctest: +NORMALIZE_WHITESPACE 3017 # Constructed from biom file 3018 #OTU ID s1 s2 s3 3019 o1 2.0 1.0 4.0 3020 o2 0.0 2.5 3.0 3021 o3 3.0 2.5 2.0 3022 o4 1.0 4.0 1.0 3023 3024 """ 3025 def f(val, id_, _): 3026 return scipy.stats.rankdata(val, method=method) 3027 return self.transform(f, axis=axis, inplace=inplace) 3028 3029 def norm(self, axis='sample', inplace=True): 3030 """Normalize in place sample values by an observation, or vice versa. 3031 3032 Parameters 3033 ---------- 3034 axis : {'sample', 'observation'}, optional 3035 The axis to use for normalization. 3036 inplace : bool, optional 3037 Defaults to ``True``. If ``True``, performs the normalization in 3038 place. Otherwise, returns a new table with the normalization 3039 applied. 3040 3041 Returns 3042 ------- 3043 biom.Table 3044 The normalized table 3045 3046 Examples 3047 -------- 3048 >>> import numpy as np 3049 >>> from biom.table import Table 3050 3051 Create a 2x2 table: 3052 3053 >>> data = np.asarray([[2, 0], [6, 1]]) 3054 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2']) 3055 3056 Get a version of the table normalized on the 'sample' axis, leaving the 3057 original table untouched: 3058 3059 >>> new_table = table.norm(inplace=False) 3060 >>> print(table) # doctest: +NORMALIZE_WHITESPACE 3061 # Constructed from biom file 3062 #OTU ID S1 S2 3063 O1 2.0 0.0 3064 O2 6.0 1.0 3065 >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE 3066 # Constructed from biom file 3067 #OTU ID S1 S2 3068 O1 0.25 0.0 3069 O2 0.75 1.0 3070 3071 Get a version of the table normalized on the 'observation' axis, 3072 again leaving the original table untouched: 3073 3074 >>> new_table = table.norm(axis='observation', inplace=False) 3075 >>> print(table) # doctest: +NORMALIZE_WHITESPACE 3076 # Constructed from biom file 3077 #OTU ID S1 S2 3078 O1 2.0 0.0 3079 O2 6.0 1.0 3080 >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE 3081 # Constructed from biom file 3082 #OTU ID S1 S2 3083 O1 1.0 0.0 3084 O2 0.857142857143 0.142857142857 3085 3086 Do the same normalization on 'observation', this time in-place: 3087 3088 >>> table.norm(axis='observation') 3089 2 x 2 <class 'biom.table.Table'> with 3 nonzero entries (75% dense) 3090 >>> print(table) # doctest: +NORMALIZE_WHITESPACE 3091 # Constructed from biom file 3092 #OTU ID S1 S2 3093 O1 1.0 0.0 3094 O2 0.857142857143 0.142857142857 3095 """ 3096 def f(val, id_, _): 3097 return val / float(val.sum()) 3098 3099 return self.transform(f, axis=axis, inplace=inplace) 3100 3101 def nonzero(self): 3102 """Yields locations of nonzero elements within the data matrix 3103 3104 Returns 3105 ------- 3106 generator 3107 Yields ``(observation_id, sample_id)`` for each nonzero element 3108 """ 3109 csr = self._data.tocsr() 3110 samp_ids = self.ids() 3111 obs_ids = self.ids(axis='observation') 3112 3113 indptr = csr.indptr 3114 indices = csr.indices 3115 for row_idx in range(indptr.size - 1): 3116 start = indptr[row_idx] 3117 end = indptr[row_idx+1] 3118 3119 obs_id = obs_ids[row_idx] 3120 for col_idx in indices[start:end]: 3121 yield (obs_id, samp_ids[col_idx]) 3122 3123 def nonzero_counts(self, axis, binary=True): 3124 """Get nonzero summaries about an axis 3125 3126 Parameters 3127 ---------- 3128 axis : {'sample', 'observation', 'whole'} 3129 The axis on which to count nonzero entries 3130 binary : bool, optional 3131 Defaults to ``True``. If ``True``, return number of nonzero 3132 entries. If ``False``, sum the values of the entries. 3133 3134 Returns 3135 ------- 3136 numpy.array 3137 Counts in index order to the axis 3138 """ 3139 if binary: 3140 dtype = 'int' 3141 3142 def op(x): 3143 return x.nonzero()[0].size 3144 else: 3145 dtype = self.dtype 3146 3147 def op(x): 3148 return x.sum() 3149 3150 if axis in ('sample', 'observation'): 3151 # can use np.bincount for CSMat or ScipySparse 3152 result = zeros(len(self.ids(axis=axis)), dtype=dtype) 3153 for idx, vals in enumerate(self.iter_data(axis=axis)): 3154 result[idx] = op(vals) 3155 else: 3156 result = zeros(1, dtype=dtype) 3157 for vals in self.iter_data(): 3158 result[0] += op(vals) 3159 3160 return result 3161 3162 def _union_id_order(self, a, b): 3163 """Determines merge order for id lists A and B""" 3164 all_ids = list(a[:]) 3165 all_ids.extend(b[:]) 3166 new_order = {} 3167 idx = 0 3168 for id_ in all_ids: 3169 if id_ not in new_order: 3170 new_order[id_] = idx 3171 idx += 1 3172 return new_order 3173 3174 def _intersect_id_order(self, a, b): 3175 """Determines the merge order for id lists A and B""" 3176 all_b = set(b[:]) 3177 new_order = {} 3178 idx = 0 3179 for id_ in a: 3180 if id_ in all_b: 3181 new_order[id_] = idx 3182 idx += 1 3183 return new_order 3184 3185 def remove_empty(self, axis='whole', inplace=True): 3186 """Remove empty samples or observations from the table 3187 3188 Parameters 3189 ---------- 3190 axis : {'whole', 'sample', 'observation'}, optional 3191 The axis on which to operate. 3192 inplace : bool, optional 3193 If ``True`` vectors are removed in ``self``; if ``False`` the 3194 vectors are removed in a new table is returned. 3195 3196 Raises 3197 ------ 3198 UnknownAxisError 3199 If the axis is not recognized. 3200 3201 Returns 3202 ------- 3203 Table 3204 A table object with the zero'd rows, or columns removed as 3205 specified by the `axis` parameter. 3206 """ 3207 if axis not in ['sample', 'observation', 'whole']: 3208 raise UnknownAxisError(axis) 3209 3210 if inplace: 3211 table = self 3212 else: 3213 table = self.copy() 3214 3215 if axis == 'whole': 3216 axes = ['sample', 'observation'] 3217 else: 3218 axes = [axis] 3219 3220 for ax in axes: 3221 table.filter(table.ids(axis=ax)[table.sum(axis=ax) > 0], axis=ax) 3222 3223 return table 3224 3225 def align_to(self, other, axis='detect'): 3226 """Align self to other over a requested axis 3227 3228 Parameters 3229 ---------- 3230 other : biom.Table 3231 The table to align too 3232 axis : str, optional, {sample, observation, both, detect} 3233 If 'sample' or 'observation', align to that axis. If 'both', align 3234 both axes. If 'detect', align what can be aligned. 3235 3236 Raises 3237 ------ 3238 DisjointIDError 3239 If the requested axis can't be aligned. 3240 UnknownAxisError 3241 If an unrecognized axis is specified. 3242 3243 Examples 3244 -------- 3245 Align one table to another, for instance a table of 16S data to a table 3246 of metagenomic data. In this example, we're aligning the samples of the 3247 two tables. 3248 3249 >>> from biom import Table 3250 >>> import numpy as np 3251 >>> amplicon = Table(np.array([[0, 1, 2], [3, 4, 5]]), 3252 ... ['Ecoli', 'Staphylococcus'], 3253 ... ['S1', 'S2', 'S3']) 3254 >>> metag = Table(np.array([[6, 7, 8], [9, 10, 11]]), 3255 ... ['geneA', 'geneB'], 3256 ... ['S3', 'S2', 'S1']) 3257 >>> amplicon = amplicon.align_to(metag) 3258 >>> print(amplicon) # doctest: +NORMALIZE_WHITESPACE 3259 # Constructed from biom file 3260 #OTU ID S3 S2 S1 3261 Ecoli 2.0 1.0 0.0 3262 Staphylococcus 5.0 4.0 3.0 3263 """ 3264 self_o = set(self.ids(axis='observation')) 3265 self_s = set(self.ids()) 3266 other_o = set(other.ids(axis='observation')) 3267 other_s = set(other.ids()) 3268 3269 alignable_o = self_o == other_o 3270 alignable_s = self_s == other_s 3271 3272 if axis == 'both' and not (alignable_o and alignable_s): 3273 raise DisjointIDError("Cannot align both axes") 3274 elif axis == 'sample' and not alignable_s: 3275 raise DisjointIDError("Cannot align samples") 3276 elif axis == 'observation' and not alignable_o: 3277 raise DisjointIDError("Cannot align observations") 3278 elif axis == 'detect' and not (alignable_o or alignable_s): 3279 raise DisjointIDError("Neither axis appears alignable") 3280 3281 if axis == 'both': 3282 order = ['observation', 'sample'] 3283 elif axis == 'detect': 3284 order = [] 3285 if alignable_s: 3286 order.append('sample') 3287 if alignable_o: 3288 order.append('observation') 3289 elif axis == 'sample': 3290 order = ['sample'] 3291 elif axis == 'observation': 3292 order = ['observation'] 3293 else: 3294 raise UnknownAxisError("Unrecognized axis: %s" % axis) 3295 3296 table = self 3297 for aln_axis in order: 3298 table = table.sort_order(other.ids(axis=aln_axis), 3299 axis=aln_axis) 3300 3301 return table 3302 3303 def concat(self, others, axis='sample'): 3304 """Concatenate tables if axis is disjoint 3305 3306 Parameters 3307 ---------- 3308 others : iterable of biom.Table, or a single biom.Table instance 3309 Tables to concatenate 3310 axis : {'sample', 'observation'}, optional 3311 The axis to concatenate on. i.e., if axis is 'sample', then tables 3312 will be joined such that the set of sample IDs in the resulting 3313 table will be the union of sample IDs across all tables in others. 3314 3315 Raises 3316 ------ 3317 DisjointIDError 3318 If IDs over the axis are not disjoint. 3319 3320 Notes 3321 ----- 3322 The type of the table is inherited from self. 3323 3324 Examples 3325 -------- 3326 Concatenate three tables in which the sample IDs are disjoint. Note 3327 the observation IDs in this example are not disjoint (although they 3328 can be): 3329 3330 >>> from biom import Table 3331 >>> import numpy as np 3332 >>> a = Table(np.array([[0, 1, 2], [3, 4, 5]]), ['O1', 'O2'], 3333 ... ['S1', 'S2', 'S3'], 3334 ... [{'taxonomy': 'foo'}, {'taxonomy': 'bar'}]) 3335 >>> b = Table(np.array([[6, 7, 8], [9, 10, 11]]), ['O3', 'O4'], 3336 ... ['S4', 'S5', 'S6'], 3337 ... [{'taxonomy': 'baz'}, {'taxonomy': 'foobar'}]) 3338 >>> c = Table(np.array([[12, 13, 14], [15, 16, 17]]), ['O1', 'O5'], 3339 ... ['S7', 'S8', 'S9'], 3340 ... [{'taxonomy': 'foo'}, {'taxonomy': 'biz'}]) 3341 >>> d = a.concat([b, c]) 3342 >>> print(d) # doctest: +NORMALIZE_WHITESPACE 3343 # Constructed from biom file 3344 #OTU ID S1 S2 S3 S4 S5 S6 S7 S8 S9 3345 O1 0.0 1.0 2.0 0.0 0.0 0.0 12.0 13.0 14.0 3346 O2 3.0 4.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 3347 O3 0.0 0.0 0.0 6.0 7.0 8.0 0.0 0.0 0.0 3348 O4 0.0 0.0 0.0 9.0 10.0 11.0 0.0 0.0 0.0 3349 O5 0.0 0.0 0.0 0.0 0.0 0.0 15.0 16.0 17.0 3350 3351 """ 3352 if isinstance(others, self.__class__): 3353 others = [others, ] 3354 3355 # we grow along the opposite axis 3356 invaxis = self._invert_axis(axis) 3357 if axis == 'sample': 3358 dim_getter = itemgetter(1) 3359 stack = hstack 3360 invstack = vstack 3361 else: 3362 dim_getter = itemgetter(0) 3363 stack = vstack 3364 invstack = hstack 3365 3366 axis_ids = set() 3367 invaxis_ids = set() 3368 invaxis_metadata = {} 3369 3370 all_tables = others[:] 3371 all_tables.insert(0, self) 3372 3373 # verify disjoint, and fetch all ids from all tables 3374 for table in all_tables: 3375 table_axis_ids = table.ids(axis=axis) 3376 table_invaxis_order = table.ids(axis=invaxis) 3377 table_invaxis = set(table_invaxis_order) 3378 3379 # test we have disjoint IDs 3380 if not axis_ids.isdisjoint(table_axis_ids): 3381 raise DisjointIDError("IDs are not disjoint") 3382 axis_ids.update(table_axis_ids) 3383 3384 if set(table_invaxis) - invaxis_ids: 3385 for i in (set(table_invaxis) - invaxis_ids): 3386 invaxis_metadata[i] = table.metadata(i, axis=invaxis) 3387 3388 # add to our perspective all inv axis IDs 3389 invaxis_ids.update(table_invaxis) 3390 3391 invaxis_order = sorted(invaxis_ids) 3392 3393 # determine what inv axis IDs do not exist per table, and pad and sort 3394 # as necessary 3395 padded_tables = [] 3396 for table in all_tables: 3397 missing_ids = list(invaxis_ids - set(table.ids(axis=invaxis))) 3398 3399 if missing_ids: 3400 # determine new shape 3401 n_invaxis = len(missing_ids) 3402 n_axis = len(table.ids(axis=axis)) 3403 if axis == 'sample': 3404 shape = (n_invaxis, n_axis) 3405 else: 3406 shape = (n_axis, n_invaxis) 3407 3408 # create the padded matrix 3409 zerod = csr_matrix(shape) 3410 tmp_mat = invstack([table.matrix_data, zerod]) 3411 3412 # resolve invert axis ids and metadata 3413 tmp_inv_ids = list(table.ids(axis=invaxis)) 3414 tmp_inv_ids.extend(missing_ids) 3415 tmp_inv_md = table.metadata(axis=invaxis) 3416 if tmp_inv_md is None: 3417 tmp_inv_md = [None] * len(table.ids(axis=invaxis)) 3418 else: 3419 tmp_inv_md = list(tmp_inv_md) 3420 tmp_inv_md.extend([invaxis_metadata[i] for i in missing_ids]) 3421 3422 # resolve axis ids and metadata 3423 tmp_ids = list(table.ids(axis=axis)) 3424 tmp_md = table.metadata(axis=axis) 3425 3426 # resolve construction based off axis. This really should be 3427 # pushed to a classmethod. 3428 if axis == 'sample': 3429 tmp_table = self.__class__(tmp_mat, tmp_inv_ids, tmp_ids, 3430 tmp_inv_md, tmp_md) 3431 else: 3432 tmp_table = self.__class__(tmp_mat, tmp_ids, tmp_inv_ids, 3433 tmp_md, tmp_inv_md) 3434 else: 3435 tmp_table = table 3436 3437 # sort the table if necessary 3438 if (tmp_table.ids(axis=invaxis) == invaxis_order).all(): 3439 padded_tables.append(tmp_table) 3440 else: 3441 padded_tables.append(tmp_table.sort_order(invaxis_order, 3442 axis=invaxis)) 3443 3444 # actually concatenate the matrices, IDs and metadata 3445 concat_mat = stack([t.matrix_data for t in padded_tables]) 3446 concat_ids = np.concatenate([t.ids(axis=axis) for t in padded_tables]) 3447 concat_md = [] 3448 for table in padded_tables: 3449 metadata = table.metadata(axis=axis) 3450 if metadata is None: 3451 metadata = [None] * dim_getter(table.shape) 3452 concat_md.extend(metadata) 3453 3454 # inverse axis sourced from whatever is in the first table 3455 inv_md = padded_tables[0].metadata(axis=invaxis) 3456 if axis == 'sample': 3457 concat = self.__class__(concat_mat, invaxis_order, concat_ids, 3458 inv_md, concat_md, type=self.type) 3459 else: 3460 concat = self.__class__(concat_mat, concat_ids, invaxis_order, 3461 concat_md, inv_md, type=self.type) 3462 3463 return concat 3464 3465 def _fast_merge(self, others): 3466 """For simple merge operations it is faster to aggregate using pandas 3467 3468 Parameters 3469 ---------- 3470 others : Table, or Iterable of Table 3471 If a Table, then merge with that table. If an iterable, then merge 3472 all of the tables 3473 """ 3474 tables = [self] + others 3475 3476 # gather all identifiers across tables 3477 all_features = reduce(or_, [set(t.ids(axis='observation')) 3478 for t in tables]) 3479 all_samples = reduce(or_, [set(t.ids()) for t in tables]) 3480 3481 # generate unique integer ids for the identifiers, and let's order 3482 # it to be polite 3483 feature_map = {i: idx for idx, i in enumerate(sorted(all_features))} 3484 sample_map = {i: idx for idx, i in enumerate(sorted(all_samples))} 3485 3486 # produce a new stable order 3487 get1 = lambda x: x[1] # noqa 3488 feature_order = [k for k, v in sorted(feature_map.items(), key=get1)] 3489 sample_order = [k for k, v in sorted(sample_map.items(), key=get1)] 3490 3491 mi = [] 3492 values = [] 3493 for table in tables: 3494 # these data are effectively [((row_index, col_index), value), ] 3495 data_as_dok = table.matrix_data.todok() 3496 3497 # construct a map of the feature integer index to what it is in 3498 # the full table 3499 feat_ids = table.ids(axis='observation') 3500 samp_ids = table.ids() 3501 table_features = {idx: feature_map[i] 3502 for idx, i in enumerate(feat_ids)} 3503 table_samples = {idx: sample_map[i] 3504 for idx, i in enumerate(samp_ids)} 3505 3506 for (f, s), v in data_as_dok.items(): 3507 # collect the indices and values, adjusting the indices as we 3508 # go 3509 mi.append((table_features[f], table_samples[s])) 3510 values.append(v) 3511 3512 # construct a multiindex of the indices where the outer index is the 3513 # feature and the inner index is the sample 3514 mi = pd.MultiIndex.from_tuples(mi) 3515 grouped = pd.Series(values, index=mi) 3516 3517 # aggregate the values where the outer and inner values in the 3518 # multiindex are the same 3519 collapsed_rcv = grouped.groupby(level=[0, 1]).sum() 3520 3521 # convert into a representation understood by the Table constructor 3522 list_list = [[r, c, v] for (r, c), v in collapsed_rcv.items()] 3523 3524 return self.__class__(list_list, feature_order, sample_order) 3525 3526 def merge(self, other, sample='union', observation='union', 3527 sample_metadata_f=prefer_self, 3528 observation_metadata_f=prefer_self): 3529 """Merge two tables together 3530 3531 The axes, samples and observations, can be controlled independently. 3532 Both can work on either "union" or "intersection". 3533 3534 `sample_metadata_f` and `observation_metadata_f` define how to 3535 merge metadata between tables. The default is to just keep the metadata 3536 associated to self if self has metadata otherwise take metadata from 3537 other. These functions are given both metadata dicts and must return 3538 a single metadata dict 3539 3540 Parameters 3541 ---------- 3542 other : biom.Table or Iterable of Table 3543 The other table to merge with this one. If an iterable, the tables 3544 are expected to not have metadata. 3545 sample : {'union', 'intersection'}, optional 3546 observation : {'union', 'intersection'}, optional 3547 sample_metadata_f : function, optional 3548 Defaults to ``biom.util.prefer_self``. Defines how to handle sample 3549 metadata during merge. 3550 obesrvation_metadata_f : function, optional 3551 Defaults to ``biom.util.prefer_self``. Defines how to handle 3552 observation metdata during merge. 3553 3554 Returns 3555 ------- 3556 biom.Table 3557 The merged table 3558 3559 Notes 3560 ----- 3561 - If ``sample_metadata_f`` and ``observation_metadata_f`` are None, 3562 then a fast merge is applied. 3563 - There is an implicit type conversion to ``float``. 3564 - The return type is always that of ``self`` 3565 3566 Examples 3567 -------- 3568 3569 >>> import numpy as np 3570 >>> from biom.table import Table 3571 3572 Create a 2x2 table and a 3x2 table: 3573 3574 >>> d_a = np.asarray([[2, 0], [6, 1]]) 3575 >>> t_a = Table(d_a, ['O1', 'O2'], ['S1', 'S2']) 3576 >>> d_b = np.asarray([[4, 5], [0, 3], [10, 10]]) 3577 >>> t_b = Table(d_b, ['O1', 'O2', 'O3'], ['S1', 'S2']) 3578 3579 Merging the table results in the overlapping samples/observations (see 3580 `O1` and `S2`) to be summed and the non-overlapping ones to be added to 3581 the resulting table (see `S3`). 3582 3583 >>> merged_table = t_a.merge(t_b) 3584 >>> print(merged_table) # doctest: +NORMALIZE_WHITESPACE 3585 # Constructed from biom file 3586 #OTU ID S1 S2 3587 O1 6.0 5.0 3588 O2 6.0 4.0 3589 O3 10.0 10.0 3590 3591 """ 3592 s_md = self.metadata() 3593 o_md = self.metadata(axis='observation') 3594 no_md = (s_md is None) and (o_md is None) 3595 ignore_md = (sample_metadata_f is None) and \ 3596 (observation_metadata_f is None) 3597 3598 if no_md or ignore_md: 3599 if sample == 'union' and observation == 'union': 3600 if isinstance(other, (list, set, tuple)): 3601 return self._fast_merge(other) 3602 else: 3603 return self._fast_merge([other, ]) 3604 3605 # determine the sample order in the resulting table 3606 if sample == 'union': 3607 new_samp_order = self._union_id_order(self.ids(), other.ids()) 3608 elif sample == 'intersection': 3609 new_samp_order = self._intersect_id_order(self.ids(), other.ids()) 3610 else: 3611 raise TableException("Unknown sample merge type: %s" % sample) 3612 3613 # determine the observation order in the resulting table 3614 if observation == 'union': 3615 new_obs_order = self._union_id_order( 3616 self.ids(axis='observation'), other.ids(axis='observation')) 3617 elif observation == 'intersection': 3618 new_obs_order = self._intersect_id_order( 3619 self.ids(axis='observation'), other.ids(axis='observation')) 3620 else: 3621 raise TableException( 3622 "Unknown observation merge type: %s" % 3623 observation) 3624 3625 # convert these to lists, no need to be dictionaries and reduces 3626 # calls to items() and allows for pre-caluculating insert order 3627 new_samp_order = sorted(new_samp_order.items(), key=itemgetter(1)) 3628 new_obs_order = sorted(new_obs_order.items(), key=itemgetter(1)) 3629 3630 # if we don't have any samples, complain loudly. This is likely from 3631 # performing an intersection without overlapping ids 3632 if not new_samp_order: 3633 raise TableException("No samples in resulting table!") 3634 if not new_obs_order: 3635 raise TableException("No observations in resulting table!") 3636 3637 # helper index lookups 3638 other_obs_idx = other._obs_index 3639 self_obs_idx = self._obs_index 3640 other_samp_idx = other._sample_index 3641 self_samp_idx = self._sample_index 3642 3643 # pre-calculate sample order from each table. We only need to do this 3644 # once which dramatically reduces the number of dict lookups necessary 3645 # within the inner loop 3646 other_samp_order = [] 3647 self_samp_order = [] 3648 for samp_id, nsi in new_samp_order: # nsi -> new_sample_index 3649 other_samp_order.append((nsi, other_samp_idx.get(samp_id, None))) 3650 self_samp_order.append((nsi, self_samp_idx.get(samp_id, None))) 3651 3652 # pre-allocate the a list for placing the resulting vectors as the 3653 # placement id is not ordered 3654 vals = [None for i in range(len(new_obs_order))] 3655 3656 # POSSIBLE DECOMPOSITION 3657 # resulting sample ids and sample metadata 3658 sample_ids = [] 3659 sample_md = [] 3660 self_sample_md = self.metadata() 3661 other_sample_md = other.metadata() 3662 for id_, idx in new_samp_order: 3663 sample_ids.append(id_) 3664 3665 # if we have sample metadata, grab it 3666 if self_sample_md is None or not self.exists(id_): 3667 self_md = None 3668 else: 3669 self_md = self_sample_md[self_samp_idx[id_]] 3670 3671 # if we have sample metadata, grab it 3672 if other_sample_md is None or not other.exists(id_): 3673 other_md = None 3674 else: 3675 other_md = other_sample_md[other_samp_idx[id_]] 3676 3677 sample_md.append(sample_metadata_f(self_md, other_md)) 3678 3679 # POSSIBLE DECOMPOSITION 3680 # resulting observation ids and sample metadata 3681 obs_ids = [] 3682 obs_md = [] 3683 self_obs_md = self.metadata(axis='observation') 3684 other_obs_md = other.metadata(axis='observation') 3685 for id_, idx in new_obs_order: 3686 obs_ids.append(id_) 3687 3688 # if we have observation metadata, grab it 3689 if self_obs_md is None or not self.exists(id_, axis="observation"): 3690 self_md = None 3691 else: 3692 self_md = self_obs_md[self_obs_idx[id_]] 3693 3694 # if we have observation metadata, grab it 3695 if other_obs_md is None or \ 3696 not other.exists(id_, axis="observation"): 3697 other_md = None 3698 else: 3699 other_md = other_obs_md[other_obs_idx[id_]] 3700 3701 obs_md.append(observation_metadata_f(self_md, other_md)) 3702 3703 # length used for construction of new vectors 3704 vec_length = len(new_samp_order) 3705 3706 # walk over observations in our new order 3707 for obs_id, new_obs_idx in new_obs_order: 3708 # create new vector for matrix values 3709 new_vec = zeros(vec_length, dtype='float') 3710 3711 # This method allows for the creation of a matrix of self type. 3712 # See note above 3713 # new_vec = data_f() 3714 3715 # see if the observation exists in other, if so, pull it out. 3716 # if not, set to the placeholder missing 3717 if other.exists(obs_id, axis="observation"): 3718 other_vec = other.data(obs_id, 'observation') 3719 else: 3720 other_vec = None 3721 3722 # see if the observation exists in self, if so, pull it out. 3723 # if not, set to the placeholder missing 3724 if self.exists(obs_id, axis="observation"): 3725 self_vec = self.data(obs_id, 'observation') 3726 else: 3727 self_vec = None 3728 3729 # short circuit. If other doesn't have any values, then we can just 3730 # take all values from self 3731 if other_vec is None: 3732 for (n_idx, s_idx) in self_samp_order: 3733 if s_idx is not None: 3734 new_vec[n_idx] = self_vec[s_idx] 3735 3736 # short circuit. If self doesn't have any values, then we can just 3737 # take all values from other 3738 elif self_vec is None: 3739 for (n_idx, o_idx) in other_samp_order: 3740 if o_idx is not None: 3741 new_vec[n_idx] = other_vec[o_idx] 3742 3743 else: 3744 # NOTE: DM 7.5.12, no observed improvement at the profile level 3745 # was made on this inner loop by using self_samp_order and 3746 # other_samp_order lists. 3747 3748 # walk over samples in our new order 3749 for samp_id, new_samp_idx in new_samp_order: 3750 # pull out each individual sample value. This is expensive, 3751 # but the vectors are in a different alignment. It is 3752 # possible that this could be improved with numpy take but 3753 # needs to handle missing values appropriately 3754 if samp_id not in self_samp_idx: 3755 self_vec_value = 0 3756 else: 3757 self_vec_value = self_vec[self_samp_idx[samp_id]] 3758 3759 if samp_id not in other_samp_idx: 3760 other_vec_value = 0 3761 else: 3762 other_vec_value = other_vec[other_samp_idx[samp_id]] 3763 3764 new_vec[new_samp_idx] = self_vec_value + other_vec_value 3765 3766 # convert our new vector to self type as to make sure we don't 3767 # accidently force a dense representation in memory 3768 vals[new_obs_idx] = self._conv_to_self_type(new_vec) 3769 3770 return self.__class__(self._conv_to_self_type(vals), obs_ids[:], 3771 sample_ids[:], obs_md, sample_md) 3772 3773 @classmethod 3774 def from_hdf5(cls, h5grp, ids=None, axis='sample', parse_fs=None, 3775 subset_with_metadata=True): 3776 """Parse an HDF5 formatted BIOM table 3777 3778 If ids is provided, only the samples/observations listed in ids 3779 (depending on the value of axis) will be loaded 3780 3781 The expected structure of this group is below. A few basic definitions, 3782 N is the number of observations and M is the number of samples. Data 3783 are stored in both compressed sparse row (for observation oriented 3784 operations) and compressed sparse column (for sample oriented 3785 operations). 3786 3787 Notes 3788 ----- 3789 The expected HDF5 group structure is below. An example of an HDF5 file 3790 in DDL can be found here [3]_. 3791 3792 - ./id : str, an \ 3793arbitrary ID 3794 - ./type : str, the \ 3795table type (e.g, OTU table) 3796 - ./format-url : str, a URL \ 3797that describes the format 3798 - ./format-version : two element \ 3799tuple of int32, major and minor 3800 - ./generated-by : str, what \ 3801generated this file 3802 - ./creation-date : str, ISO \ 3803format 3804 - ./shape : two element \ 3805tuple of int32, N by M 3806 - ./nnz : int32 or \ 3807int64, number of non zero elems 3808 - ./observation : Group 3809 - ./observation/ids : (N,) dataset\ 3810 of str or vlen str 3811 - ./observation/matrix : Group 3812 - ./observation/matrix/data : (nnz,) \ 3813dataset of float64 3814 - ./observation/matrix/indices : (nnz,) \ 3815dataset of int32 3816 - ./observation/matrix/indptr : (M+1,) \ 3817dataset of int32 3818 - ./observation/metadata : Group 3819 - [./observation/metadata/foo] : Optional, \ 3820(N,) dataset of any valid HDF5 type in index order with IDs. 3821 - ./observation/group-metadata : Group 3822 - [./observation/group-metadata/foo] : Optional, \ 3823(?,) dataset of group metadata that relates IDs 3824 - [./observation/group-metadata/foo.attrs['data_type']] : attribute of\ 3825 the foo dataset that describes contained type (e.g., newick) 3826 - ./sample : Group 3827 - ./sample/ids : (M,) dataset\ 3828 of str or vlen str 3829 - ./sample/matrix : Group 3830 - ./sample/matrix/data : (nnz,) \ 3831dataset of float64 3832 - ./sample/matrix/indices : (nnz,) \ 3833dataset of int32 3834 - ./sample/matrix/indptr : (N+1,) \ 3835dataset of int32 3836 - ./sample/metadata : Group 3837 - [./sample/metadata/foo] : Optional, \ 3838(M,) dataset of any valid HDF5 type in index order with IDs. 3839 - ./sample/group-metadata : Group 3840 - [./sample/group-metadata/foo] : Optional, \ 3841(?,) dataset of group metadata that relates IDs 3842 - [./sample/group-metadata/foo.attrs['data_type']] : attribute of\ 3843 the foo dataset that describes contained type (e.g., newick) 3844 3845 The '?' character on the dataset size means that it can be of arbitrary 3846 length. 3847 3848 The expected structure for each of the metadata datasets is a list of 3849 atomic type objects (int, float, str, ...), where the index order of 3850 the list corresponds to the index order of the relevant axis IDs. 3851 Special metadata fields have been defined, and they are stored in a 3852 specific way. Currently, the available special metadata fields are: 3853 3854 - taxonomy: (N, ?) dataset of str or vlen str 3855 - KEGG_Pathways: (N, ?) dataset of str or vlen str 3856 - collapsed_ids: (N, ?) dataset of str or vlen str 3857 3858 Parameters 3859 ---------- 3860 h5grp : a h5py ``Group`` or an open h5py ``File`` 3861 ids : iterable 3862 The sample/observation ids of the samples/observations that we need 3863 to retrieve from the hdf5 biom table 3864 axis : {'sample', 'observation'}, optional 3865 The axis to subset on 3866 parse_fs : dict, optional 3867 Specify custom parsing functions for metadata fields. This dict is 3868 expected to be {'metadata_field': function}, where the function 3869 signature is (object) corresponding to a single row in the 3870 associated metadata dataset. The return from this function an 3871 object as well, and is the parsed representation of the metadata. 3872 subset_with_metadata : bool, optional 3873 When subsetting (i.e., `ids` is `not None`), whether to also parse 3874 the metadata. By default, the metadata are also subset. The reason 3875 for exposing this functionality is that, for large tables, there 3876 exists a very large overhead for this metadata manipulation. 3877 3878 Returns 3879 ------- 3880 biom.Table 3881 A BIOM ``Table`` object 3882 3883 Raises 3884 ------ 3885 ValueError 3886 If `ids` are not a subset of the samples or observations ids 3887 present in the hdf5 biom table 3888 If h5grp is not a HDF5 file or group 3889 3890 References 3891 ---------- 3892 .. [1] http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/sci\ 3893py.sparse.csr_matrix.html 3894 .. [2] http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/sci\ 3895py.sparse.csc_matrix.html 3896 .. [3] http://biom-format.org/documentation/format_versions/biom-2.0.\ 3897html 3898 3899 See Also 3900 -------- 3901 Table.to_hdf5 3902 3903 Examples 3904 -------- 3905 >>> from biom.table import Table 3906 >>> from biom.util import biom_open 3907 >>> with biom_open('rich_sparse_otu_table_hdf5.biom') as f \ 3908# doctest: +SKIP 3909 >>> t = Table.from_hdf5(f) # doctest: +SKIP 3910 3911 Parse a hdf5 biom table subsetting observations 3912 >>> from biom.util import biom_open # doctest: +SKIP 3913 >>> from biom.parse import parse_biom_table 3914 >>> with biom_open('rich_sparse_otu_table_hdf5.biom') as f \ 3915# doctest: +SKIP 3916 >>> t = Table.from_hdf5(f, ids=["GG_OTU_1"], 3917 ... axis='observation') # doctest: +SKIP 3918 """ 3919 if not HAVE_H5PY: 3920 raise RuntimeError("h5py is not in the environment, HDF5 support " 3921 "is not available") 3922 3923 import h5py 3924 if not isinstance(h5grp, (h5py.Group, h5py.File)): 3925 raise ValueError("h5grp does not appear to be an HDF5 file or " 3926 "group") 3927 3928 if axis not in ['sample', 'observation']: 3929 raise UnknownAxisError(axis) 3930 3931 if parse_fs is None: 3932 parse_fs = {} 3933 3934 if not subset_with_metadata and ids is not None: 3935 ids = set(ids) 3936 3937 raw_indices = h5grp['%s/matrix/indices' % axis] 3938 raw_indptr = h5grp['%s/matrix/indptr' % axis] 3939 raw_data = h5grp['%s/matrix/data' % axis] 3940 axis_ids = h5grp['%s/ids' % axis][:] 3941 3942 to_keep = np.array([i for i, id_ in enumerate(axis_ids) 3943 if id_ in ids], dtype=int) 3944 start_end = [(raw_indptr[i], raw_indptr[i+1]) for i in to_keep] 3945 indptr = np.empty(len(to_keep) + 1, dtype=np.int32) 3946 indptr[0] = 0 3947 indptr[1:] = np.array([e - s for s, e in start_end]).cumsum() 3948 data = np.concatenate([raw_data[s:e] for s, e in start_end]) 3949 indices = np.concatenate([raw_indices[s:e] for s, e in start_end]) 3950 3951 if axis == 'sample': 3952 obs_ids = h5grp['observation/ids'][:] 3953 samp_ids = axis_ids[to_keep] 3954 shape = (len(obs_ids), len(to_keep)) 3955 mat = csc_matrix((data, indices, indptr), shape=shape) 3956 else: 3957 samp_ids = h5grp['sample/ids'][:] 3958 obs_ids = axis_ids[to_keep] 3959 shape = (len(to_keep), len(samp_ids)) 3960 mat = csr_matrix((data, indices, indptr), shape=shape) 3961 3962 return Table(mat, obs_ids, samp_ids) 3963 3964 id_ = h5grp.attrs['id'] 3965 create_date = h5grp.attrs['creation-date'] 3966 generated_by = h5grp.attrs['generated-by'] 3967 3968 shape = h5grp.attrs['shape'] 3969 type_ = None if h5grp.attrs['type'] == '' else h5grp.attrs['type'] 3970 3971 if isinstance(id_, six.binary_type): 3972 if six.PY3: 3973 id_ = id_.decode('ascii') 3974 else: 3975 id_ = str(id_) 3976 3977 if isinstance(type_, six.binary_type): 3978 if six.PY3: 3979 type_ = type_.decode('ascii') 3980 else: 3981 type_ = str(type_) 3982 3983 def axis_load(grp): 3984 """Loads all the data of the given group""" 3985 # fetch all of the IDs 3986 ids = grp['ids'][:] 3987 3988 if ids.size > 0 and isinstance(ids[0], bytes): 3989 ids = np.array([i.decode('utf8') for i in ids]) 3990 3991 parser = defaultdict(lambda: general_parser) 3992 parser['taxonomy'] = vlen_list_of_str_parser 3993 parser['KEGG_Pathways'] = vlen_list_of_str_parser 3994 parser['collapsed_ids'] = vlen_list_of_str_parser 3995 parser.update(parse_fs) 3996 3997 # fetch ID specific metadata 3998 md = [{} for i in range(len(ids))] 3999 for category, dset in viewitems(grp['metadata']): 4000 parse_f = parser[category] 4001 data = dset[:] 4002 for md_dict, data_row in zip(md, data): 4003 md_dict[category] = parse_f(data_row) 4004 4005 # If there was no metadata on the axis, set it up as none 4006 md = md if any(md) else None 4007 4008 # Fetch the group metadata 4009 grp_md = {cat: val 4010 for cat, val in grp['group-metadata'].items()} 4011 return ids, md, grp_md 4012 4013 obs_ids, obs_md, obs_grp_md = axis_load(h5grp['observation']) 4014 samp_ids, samp_md, samp_grp_md = axis_load(h5grp['sample']) 4015 4016 # load the data 4017 data_grp = h5grp[axis]['matrix'] 4018 h5_data = data_grp["data"] 4019 h5_indices = data_grp["indices"] 4020 h5_indptr = data_grp["indptr"] 4021 4022 # Check if we need to subset the biom table 4023 if ids is not None: 4024 def _get_ids(source_ids, desired_ids): 4025 """If desired_ids is not None, makes sure that it is a subset 4026 of source_ids and returns the desired_ids array-like and a 4027 boolean array indicating where the desired_ids can be found in 4028 source_ids""" 4029 if desired_ids is None: 4030 ids = source_ids[:] 4031 idx = np.ones(source_ids.shape, dtype=bool) 4032 else: 4033 desired_ids = np.asarray(desired_ids) 4034 # Get the index of the source ids to include 4035 idx = np.in1d(source_ids, desired_ids) 4036 # Retrieve only the ids that we are interested on 4037 ids = source_ids[idx] 4038 # Check that all desired ids have been found on source ids 4039 4040 if ids.shape != desired_ids.shape: 4041 raise ValueError("The following ids could not be " 4042 "found in the biom table: %s" % 4043 (set(desired_ids) - set(ids))) 4044 return ids, idx 4045 4046 # Get the observation and sample ids that we are interested in 4047 samp, obs = (ids, None) if axis == 'sample' else (None, ids) 4048 obs_ids, obs_idx = _get_ids(obs_ids, obs) 4049 samp_ids, samp_idx = _get_ids(samp_ids, samp) 4050 4051 # Get the new matrix shape 4052 shape = (len(obs_ids), len(samp_ids)) 4053 4054 # Fetch the metadata that we are interested in 4055 def _subset_metadata(md, idx): 4056 """If md has data, returns the subset indicated by idx, a 4057 boolean array""" 4058 if md: 4059 md = list(np.asarray(md)[np.where(idx)]) 4060 return md 4061 4062 obs_md = _subset_metadata(obs_md, obs_idx) 4063 samp_md = _subset_metadata(samp_md, samp_idx) 4064 4065 # load the subset of the data 4066 idx = samp_idx if axis == 'sample' else obs_idx 4067 keep = np.where(idx)[0] 4068 indptr_indices = sorted([(h5_indptr[i], h5_indptr[i+1]) 4069 for i in keep]) 4070 # Create the new indptr 4071 indptr_subset = np.array([end - start 4072 for start, end in indptr_indices]) 4073 indptr = np.empty(len(keep) + 1, dtype=np.int32) 4074 indptr[0] = 0 4075 indptr[1:] = indptr_subset.cumsum() 4076 4077 data = np.hstack(h5_data[start:end] 4078 for start, end in indptr_indices) 4079 indices = np.hstack(h5_indices[start:end] 4080 for start, end in indptr_indices) 4081 else: 4082 # no subset need, just pass all data to scipy 4083 data = h5_data 4084 indices = h5_indices 4085 indptr = h5_indptr 4086 4087 cs = (data, indices, indptr) 4088 4089 if axis == 'sample': 4090 matrix = csc_matrix(cs, shape=shape) 4091 else: 4092 matrix = csr_matrix(cs, shape=shape) 4093 4094 t = Table(matrix, obs_ids, samp_ids, obs_md or None, 4095 samp_md or None, type=type_, create_date=create_date, 4096 generated_by=generated_by, table_id=id_, 4097 observation_group_metadata=obs_grp_md, 4098 sample_group_metadata=samp_grp_md) 4099 4100 if ids is not None: 4101 # filter out any empty samples or observations which may exist due 4102 # to subsetting 4103 def any_value(vals, id_, md): 4104 return np.any(vals) 4105 4106 axis = 'observation' if axis == 'sample' else 'sample' 4107 t.filter(any_value, axis=axis) 4108 4109 return t 4110 4111 def to_dataframe(self, dense=False): 4112 """Convert matrix data to a Pandas SparseDataFrame or DataFrame 4113 4114 Parameters 4115 ---------- 4116 dense : bool, optional 4117 If True, return pd.DataFrame instead of pd.SparseDataFrame. 4118 4119 Returns 4120 ------- 4121 pd.DataFrame or pd.SparseDataFrame 4122 A DataFrame indexed on the observation IDs, with the column 4123 names as the sample IDs. 4124 4125 Notes 4126 ----- 4127 Metadata are not included. 4128 4129 Examples 4130 -------- 4131 >>> from biom import example_table 4132 >>> df = example_table.to_dataframe() 4133 >>> df 4134 S1 S2 S3 4135 O1 0.0 1.0 2.0 4136 O2 3.0 4.0 5.0 4137 """ 4138 index = self.ids(axis='observation') 4139 columns = self.ids() 4140 4141 if dense: 4142 mat = self.matrix_data.toarray() 4143 constructor = pd.DataFrame 4144 else: 4145 mat = self.matrix_data.copy() 4146 constructor = partial(pd.DataFrame.sparse.from_spmatrix) 4147 4148 return constructor(mat, index=index, columns=columns) 4149 4150 def to_anndata(self, dense=False, dtype="float32", transpose=True): 4151 """Convert Table to AnnData format 4152 4153 Parameters 4154 ---------- 4155 dense : bool, optional 4156 If True, set adata.X as np.ndarray instead of sparse matrix. 4157 dtype: str, optional 4158 dtype used for storage in anndata object. 4159 tranpose: bool, optional 4160 If True, transpose the anndata so that observations are columns 4161 4162 Returns 4163 ------- 4164 anndata.AnnData 4165 AnnData with matrix data and associated observation and 4166 sample metadata. 4167 4168 Notes 4169 ----- 4170 Nested metadata are not included. 4171 4172 Examples 4173 -------- 4174 >>> from biom import example_table 4175 >>> adata = example_table.to_anndata() 4176 >>> adata 4177 AnnData object with n_obs × n_vars = 3 × 2 4178 obs: 'environment' 4179 var: 'taxonomy_0', 'taxonomy_1' 4180 """ 4181 try: 4182 import anndata 4183 except ImportError: 4184 raise ImportError( 4185 "Please install anndata package -- `pip install anndata`" 4186 ) 4187 mat = self.matrix_data 4188 4189 if dense: 4190 mat = mat.toarray() 4191 4192 var = self.metadata_to_dataframe("sample") 4193 obs = self.metadata_to_dataframe("observation") 4194 4195 adata = anndata.AnnData(mat, obs=obs, var=var, dtype=dtype) 4196 # Convention for scRNA-seq analysis in Python 4197 adata = adata.transpose() 4198 4199 return adata 4200 4201 def metadata_to_dataframe(self, axis): 4202 """Convert axis metadata to a Pandas DataFrame 4203 4204 Parameters 4205 ---------- 4206 axis : {'sample', 'observation'} 4207 The axis to operate on. 4208 4209 Returns 4210 ------- 4211 pd.DataFrame 4212 A DataFrame indexed by the ids of the desired axis, columns by the 4213 metadata keys over that axis. 4214 4215 Raises 4216 ------ 4217 UnknownAxisError 4218 If the requested axis isn't recognized 4219 KeyError 4220 IF the requested axis does not have metadata 4221 TypeError 4222 If a metadata column is a list or tuple, but is jagged over the 4223 axis. 4224 4225 Notes 4226 ----- 4227 Nested metadata (e.g., KEGG_Pathways) is not supported. 4228 4229 Metadata which are lists or tuples (e.g., taxonomy) are expanded such 4230 that each index position is a unique column. For instance, the key 4231 taxonomy will become "taxonomy_0", "taxonomy_1", etc where "taxonomy_0" 4232 corresponds to the 0th index position of the taxonomy. 4233 4234 Examples 4235 -------- 4236 >>> from biom import example_table 4237 >>> example_table.metadata_to_dataframe('observation') 4238 taxonomy_0 taxonomy_1 4239 O1 Bacteria Firmicutes 4240 O2 Bacteria Bacteroidetes 4241 """ 4242 md = self.metadata(axis=axis) 4243 if md is None: 4244 raise KeyError("%s does not have metadata" % axis) 4245 4246 test = md[0] 4247 kv_test = sorted(list(test.items())) 4248 columns = [] 4249 expand = {} 4250 for key, value in kv_test: 4251 if isinstance(value, (tuple, list)): 4252 expand[key] = True 4253 for idx in range(len(value)): 4254 columns.append("%s_%d" % (key, idx)) 4255 else: 4256 expand[key] = False 4257 columns.append(key) 4258 4259 rows = [] 4260 for m in md: 4261 row = [] 4262 for key, value in sorted(m.items()): 4263 if expand[key]: 4264 for v in value: 4265 row.append(v) 4266 else: 4267 row.append(value) 4268 rows.append(row) 4269 4270 return pd.DataFrame(rows, index=self.ids(axis=axis), columns=columns) 4271 4272 def to_hdf5(self, h5grp, generated_by, compress=True, format_fs=None): 4273 """Store CSC and CSR in place 4274 4275 The resulting structure of this group is below. A few basic 4276 definitions, N is the number of observations and M is the number of 4277 samples. Data are stored in both compressed sparse row [1]_ (CSR, for 4278 observation oriented operations) and compressed sparse column [2]_ 4279 (CSC, for sample oriented operations). 4280 4281 Notes 4282 ----- 4283 The expected HDF5 group structure is below. An example of an HDF5 file 4284 in DDL can be found here [3]_. 4285 4286 - ./id : str, an \ 4287arbitrary ID 4288 - ./type : str, the \ 4289table type (e.g, OTU table) 4290 - ./format-url : str, a URL \ 4291that describes the format 4292 - ./format-version : two element \ 4293tuple of int32, major and minor 4294 - ./generated-by : str, what \ 4295generated this file 4296 - ./creation-date : str, ISO \ 4297format 4298 - ./shape : two element \ 4299tuple of int32, N by M 4300 - ./nnz : int32 or \ 4301int64, number of non zero elems 4302 - ./observation : Group 4303 - ./observation/ids : (N,) dataset\ 4304 of str or vlen str 4305 - ./observation/matrix : Group 4306 - ./observation/matrix/data : (nnz,) \ 4307dataset of float64 4308 - ./observation/matrix/indices : (nnz,) \ 4309dataset of int32 4310 - ./observation/matrix/indptr : (M+1,) \ 4311dataset of int32 4312 - ./observation/metadata : Group 4313 - [./observation/metadata/foo] : Optional, \ 4314(N,) dataset of any valid HDF5 type in index order with IDs. 4315 - ./observation/group-metadata : Group 4316 - [./observation/group-metadata/foo] : Optional, \ 4317(?,) dataset of group metadata that relates IDs 4318 - [./observation/group-metadata/foo.attrs['data_type']] : attribute of\ 4319 the foo dataset that describes contained type (e.g., newick) 4320 - ./sample : Group 4321 - ./sample/ids : (M,) dataset\ 4322 of str or vlen str 4323 - ./sample/matrix : Group 4324 - ./sample/matrix/data : (nnz,) \ 4325dataset of float64 4326 - ./sample/matrix/indices : (nnz,) \ 4327dataset of int32 4328 - ./sample/matrix/indptr : (N+1,) \ 4329dataset of int32 4330 - ./sample/metadata : Group 4331 - [./sample/metadata/foo] : Optional, \ 4332(M,) dataset of any valid HDF5 type in index order with IDs. 4333 - ./sample/group-metadata : Group 4334 - [./sample/group-metadata/foo] : Optional, \ 4335(?,) dataset of group metadata that relates IDs 4336 - [./sample/group-metadata/foo.attrs['data_type']] : attribute of\ 4337 the foo dataset that describes contained type (e.g., newick) 4338 4339 The '?' character on the dataset size means that it can be of arbitrary 4340 length. 4341 4342 The expected structure for each of the metadata datasets is a list of 4343 atomic type objects (int, float, str, ...), where the index order of 4344 the list corresponds to the index order of the relevant axis IDs. 4345 Special metadata fields have been defined, and they are stored in a 4346 specific way. Currently, the available special metadata fields are: 4347 4348 - taxonomy: (N, ?) dataset of str or vlen str 4349 - KEGG_Pathways: (N, ?) dataset of str or vlen str 4350 - collapsed_ids: (N, ?) dataset of str or vlen str 4351 4352 Parameters 4353 ---------- 4354 h5grp : `h5py.Group` or `h5py.File` 4355 The HDF5 entity in which to write the BIOM formatted data. 4356 generated_by : str 4357 A description of what generated the table 4358 compress : bool, optional 4359 Defaults to ``True`` means fields will be compressed with gzip, 4360 ``False`` means no compression 4361 format_fs : dict, optional 4362 Specify custom formatting functions for metadata fields. This dict 4363 is expected to be {'metadata_field': function}, where the function 4364 signature is (h5py.Group, str, dict, bool) corresponding to the 4365 specific HDF5 group the metadata dataset will be associated with, 4366 the category being operated on, the metadata for the entire axis 4367 being operated on, and whether to enable compression on the 4368 dataset. Anything returned by this function is ignored. 4369 4370 Notes 4371 ----- 4372 This method does not return anything and operates in place on h5grp. 4373 4374 See Also 4375 -------- 4376 Table.from_hdf5 4377 4378 References 4379 ---------- 4380 .. [1] http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/sci\ 4381py.sparse.csr_matrix.html 4382 .. [2] http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/sci\ 4383py.sparse.csc_matrix.html 4384 .. [3] http://biom-format.org/documentation/format_versions/biom-2.1.\ 4385html 4386 4387 Examples 4388 -------- 4389 >>> from biom.util import biom_open # doctest: +SKIP 4390 >>> from biom.table import Table 4391 >>> from numpy import array 4392 >>> t = Table(array([[1, 2], [3, 4]]), ['a', 'b'], ['x', 'y']) 4393 >>> with biom_open('foo.biom', 'w') as f: # doctest: +SKIP 4394 ... t.to_hdf5(f, "example") 4395 4396 """ 4397 if not HAVE_H5PY: 4398 raise RuntimeError("h5py is not in the environment, HDF5 support " 4399 "is not available") 4400 4401 if format_fs is None: 4402 format_fs = {} 4403 4404 h5grp.attrs['id'] = self.table_id if self.table_id else "No Table ID" 4405 h5grp.attrs['type'] = self.type if self.type else "" 4406 h5grp.attrs['format-url'] = "http://biom-format.org" 4407 h5grp.attrs['format-version'] = self.format_version 4408 h5grp.attrs['generated-by'] = generated_by 4409 h5grp.attrs['creation-date'] = datetime.now().isoformat() 4410 h5grp.attrs['shape'] = self.shape 4411 h5grp.attrs['nnz'] = self.nnz 4412 4413 compression = None 4414 if compress is True: 4415 compression = 'gzip' 4416 4417 formatter = defaultdict(lambda: general_formatter) 4418 formatter['taxonomy'] = vlen_list_of_str_formatter 4419 formatter['KEGG_Pathways'] = vlen_list_of_str_formatter 4420 formatter['collapsed_ids'] = vlen_list_of_str_formatter 4421 formatter.update(format_fs) 4422 4423 for axis, order in zip(['observation', 'sample'], ['csr', 'csc']): 4424 grp = h5grp.create_group(axis) 4425 4426 self._data = self._data.asformat(order) 4427 4428 ids = self.ids(axis=axis) 4429 len_ids = len(ids) 4430 len_indptr = len(self._data.indptr) 4431 len_data = self.nnz 4432 4433 md = self.metadata(axis=axis) 4434 4435 # Create the group for the metadata 4436 grp.create_group('metadata') 4437 if md: 4438 exp = set(md[0]) 4439 for other_id, other_md in zip(ids[1:], md[1:]): 4440 if set(other_md) != exp: 4441 raise ValueError("%s has inconsistent metadata " 4442 "categories with %s:\n" 4443 "%s: %s\n" 4444 "%s: %s" % (other_id, ids[0], 4445 other_id, list(other_md), 4446 ids[0], list(exp))) 4447 4448 for category in md[0]: 4449 # Create the dataset for the current category, 4450 # putting values in id order 4451 formatter[category](grp, category, md, compression) 4452 4453 group_md = self.group_metadata(axis) 4454 4455 # Create the group for the group metadata 4456 grp.create_group('group-metadata') 4457 4458 if group_md: 4459 for key, value in group_md.items(): 4460 datatype, val = value 4461 grp_dataset = grp.create_dataset( 4462 'group-metadata/%s' % key, 4463 shape=(1,), dtype=H5PY_VLEN_STR, 4464 data=val, compression=compression) 4465 grp_dataset.attrs['data_type'] = datatype 4466 4467 grp.create_group('matrix') 4468 grp.create_dataset('matrix/data', shape=(len_data,), 4469 dtype=np.float64, 4470 data=self._data.data, 4471 compression=compression) 4472 grp.create_dataset('matrix/indices', shape=(len_data,), 4473 dtype=np.int32, 4474 data=self._data.indices, 4475 compression=compression) 4476 grp.create_dataset('matrix/indptr', shape=(len_indptr,), 4477 dtype=np.int32, 4478 data=self._data.indptr, 4479 compression=compression) 4480 4481 if len_ids > 0: 4482 # if we store IDs in the table as numpy arrays then this store 4483 # is cleaner, as is the parse 4484 grp.create_dataset('ids', shape=(len_ids,), 4485 dtype=H5PY_VLEN_STR, 4486 data=[i.encode('utf8') for i in ids], 4487 compression=compression) 4488 else: 4489 # Empty H5PY_VLEN_STR datasets are not supported. 4490 grp.create_dataset('ids', shape=(0, ), data=[], 4491 compression=compression) 4492 4493 @classmethod 4494 def from_json(self, json_table, data_pump=None, 4495 input_is_dense=False): 4496 """Parse a biom otu table type 4497 4498 Parameters 4499 ---------- 4500 json_table : dict 4501 A JSON object or dict that represents the BIOM table 4502 data_pump : tuple or None 4503 A secondary source of data 4504 input_is_dense : bool 4505 If `True`, the data contained will be interpretted as dense 4506 4507 Returns 4508 ------- 4509 Table 4510 4511 Examples 4512 -------- 4513 >>> from biom import Table 4514 >>> json_obj = {"id": "None", 4515 ... "format": "Biological Observation Matrix 1.0.0", 4516 ... "format_url": "http://biom-format.org", 4517 ... "generated_by": "foo", 4518 ... "type": "OTU table", 4519 ... "date": "2014-06-03T14:24:40.884420", 4520 ... "matrix_element_type": "float", 4521 ... "shape": [5, 6], 4522 ... "data": [[0,2,1.0], 4523 ... [1,0,5.0], 4524 ... [1,1,1.0], 4525 ... [1,3,2.0], 4526 ... [1,4,3.0], 4527 ... [1,5,1.0], 4528 ... [2,2,1.0], 4529 ... [2,3,4.0], 4530 ... [2,5,2.0], 4531 ... [3,0,2.0], 4532 ... [3,1,1.0], 4533 ... [3,2,1.0], 4534 ... [3,5,1.0], 4535 ... [4,1,1.0], 4536 ... [4,2,1.0]], 4537 ... "rows": [{"id": "GG_OTU_1", "metadata": None}, 4538 ... {"id": "GG_OTU_2", "metadata": None}, 4539 ... {"id": "GG_OTU_3", "metadata": None}, 4540 ... {"id": "GG_OTU_4", "metadata": None}, 4541 ... {"id": "GG_OTU_5", "metadata": None}], 4542 ... "columns": [{"id": "Sample1", "metadata": None}, 4543 ... {"id": "Sample2", "metadata": None}, 4544 ... {"id": "Sample3", "metadata": None}, 4545 ... {"id": "Sample4", "metadata": None}, 4546 ... {"id": "Sample5", "metadata": None}, 4547 ... {"id": "Sample6", "metadata": None}] 4548 ... } 4549 >>> t = Table.from_json(json_obj) 4550 4551 """ 4552 sample_ids = [col['id'] for col in json_table['columns']] 4553 sample_metadata = [col['metadata'] for col in json_table['columns']] 4554 obs_ids = [row['id'] for row in json_table['rows']] 4555 obs_metadata = [row['metadata'] for row in json_table['rows']] 4556 dtype = MATRIX_ELEMENT_TYPE[json_table['matrix_element_type']] 4557 if 'matrix_type' in json_table: 4558 if json_table['matrix_type'] == 'dense': 4559 input_is_dense = True 4560 else: 4561 input_is_dense = False 4562 type_ = json_table['type'] 4563 4564 if data_pump is None: 4565 table_obj = Table(json_table['data'], obs_ids, sample_ids, 4566 obs_metadata, sample_metadata, 4567 shape=json_table['shape'], 4568 dtype=dtype, 4569 type=type_, 4570 generated_by=json_table['generated_by'], 4571 input_is_dense=input_is_dense) 4572 else: 4573 table_obj = Table(data_pump, obs_ids, sample_ids, 4574 obs_metadata, sample_metadata, 4575 shape=json_table['shape'], 4576 dtype=dtype, 4577 type=type_, 4578 generated_by=json_table['generated_by'], 4579 input_is_dense=input_is_dense) 4580 return table_obj 4581 4582 def to_json(self, generated_by, direct_io=None): 4583 """Returns a JSON string representing the table in BIOM format. 4584 4585 Parameters 4586 ---------- 4587 generated_by : str 4588 a string describing the software used to build the table 4589 direct_io : file or file-like object, optional 4590 Defaults to ``None``. Must implementing a ``write`` function. If 4591 `direct_io` is not ``None``, the final output is written directly 4592 to `direct_io` during processing. 4593 4594 Returns 4595 ------- 4596 str 4597 A JSON-formatted string representing the biom table 4598 """ 4599 if not isinstance(generated_by, string_types): 4600 raise TableException("Must specify a generated_by string") 4601 4602 # Fill in top-level metadata. 4603 if direct_io: 4604 direct_io.write(u'{') 4605 direct_io.write(u'"id": "%s",' % str(self.table_id)) 4606 direct_io.write( 4607 u'"format": "%s",' % 4608 get_biom_format_version_string((1, 0))) # JSON table -> 1.0.0 4609 direct_io.write( 4610 u'"format_url": "%s",' % 4611 get_biom_format_url_string()) 4612 direct_io.write(u'"generated_by": "%s",' % generated_by) 4613 direct_io.write(u'"date": "%s",' % datetime.now().isoformat()) 4614 else: 4615 id_ = u'"id": "%s",' % str(self.table_id) 4616 format_ = u'"format": "%s",' % get_biom_format_version_string( 4617 (1, 0)) # JSON table -> 1.0.0 4618 format_url = u'"format_url": "%s",' % get_biom_format_url_string() 4619 generated_by = u'"generated_by": "%s",' % generated_by 4620 date = u'"date": "%s",' % datetime.now().isoformat() 4621 4622 # Determine if we have any data in the matrix, and what the shape of 4623 # the matrix is. 4624 try: 4625 num_rows, num_cols = self.shape 4626 except: # noqa 4627 num_rows = num_cols = 0 4628 has_data = True if num_rows > 0 and num_cols > 0 else False 4629 4630 # Default the matrix element type to test to be an integer in case we 4631 # don't have any data in the matrix to test. 4632 test_element = 0 4633 if has_data: 4634 test_element = self[0, 0] 4635 4636 # Determine the type of elements the matrix is storing. 4637 if isinstance(test_element, int): 4638 matrix_element_type = u"int" 4639 elif isinstance(test_element, float): 4640 matrix_element_type = u"float" 4641 elif isinstance(test_element, string_types): 4642 matrix_element_type = u"str" 4643 else: 4644 raise TableException("Unsupported matrix data type.") 4645 4646 # Fill in details about the matrix. 4647 if direct_io: 4648 direct_io.write( 4649 u'"matrix_element_type": "%s",' % 4650 matrix_element_type) 4651 direct_io.write(u'"shape": [%d, %d],' % (num_rows, num_cols)) 4652 else: 4653 matrix_element_type = u'"matrix_element_type": "%s",' % \ 4654 matrix_element_type 4655 shape = u'"shape": [%d, %d],' % (num_rows, num_cols) 4656 4657 # Fill in the table type 4658 if self.type is None: 4659 type_ = u'"type": null,' 4660 else: 4661 type_ = u'"type": "%s",' % self.type 4662 4663 if direct_io: 4664 direct_io.write(type_) 4665 4666 # Fill in details about the rows in the table and fill in the matrix's 4667 # data. BIOM 2.0+ is now only sparse 4668 if direct_io: 4669 direct_io.write(u'"matrix_type": "sparse",') 4670 direct_io.write(u'"data": [') 4671 else: 4672 matrix_type = u'"matrix_type": "sparse",' 4673 data = [u'"data": ['] 4674 4675 max_row_idx = len(self.ids(axis='observation')) - 1 4676 max_col_idx = len(self.ids()) - 1 4677 rows = [u'"rows": ['] 4678 have_written = False 4679 for obs_index, obs in enumerate(self.iter(axis='observation')): 4680 # i'm crying on the inside 4681 if obs_index != max_row_idx: 4682 rows.append(u'{"id": %s, "metadata": %s},' % (dumps(obs[1]), 4683 dumps(obs[2]))) 4684 else: 4685 rows.append(u'{"id": %s, "metadata": %s}],' % (dumps(obs[1]), 4686 dumps(obs[2]))) 4687 4688 # turns out its a pain to figure out when to place commas. the 4689 # simple work around, at the expense of a little memory 4690 # (bound by the number of samples) is to build of what will be 4691 # written, and then add in the commas where necessary. 4692 built_row = [] 4693 for col_index, val in enumerate(obs[0]): 4694 if float(val) != 0.0: 4695 built_row.append(u"[%d,%d,%r]" % (obs_index, col_index, 4696 val)) 4697 if built_row: 4698 # if we have written a row already, its safe to add a comma 4699 if have_written: 4700 if direct_io: 4701 direct_io.write(u',') 4702 else: 4703 data.append(u',') 4704 if direct_io: 4705 direct_io.write(u','.join(built_row)) 4706 else: 4707 data.append(u','.join(built_row)) 4708 4709 have_written = True 4710 4711 # finalize the data block 4712 if direct_io: 4713 direct_io.write(u"],") 4714 else: 4715 data.append(u"],") 4716 4717 # Fill in details about the columns in the table. 4718 columns = [u'"columns": ['] 4719 for samp_index, samp in enumerate(self.iter()): 4720 if samp_index != max_col_idx: 4721 columns.append(u'{"id": %s, "metadata": %s},' % ( 4722 dumps(samp[1]), dumps(samp[2]))) 4723 else: 4724 columns.append(u'{"id": %s, "metadata": %s}]' % ( 4725 dumps(samp[1]), dumps(samp[2]))) 4726 4727 if rows[0] == u'"rows": [' and len(rows) == 1: 4728 # empty table case 4729 rows = [u'"rows": [],'] 4730 columns = [u'"columns": []'] 4731 4732 rows = u''.join(rows) 4733 columns = u''.join(columns) 4734 4735 if direct_io: 4736 direct_io.write(rows) 4737 direct_io.write(columns) 4738 direct_io.write(u'}') 4739 else: 4740 return u"{%s}" % ''.join([id_, format_, format_url, matrix_type, 4741 generated_by, date, type_, 4742 matrix_element_type, shape, 4743 u''.join(data), rows, columns]) 4744 4745 @staticmethod 4746 def from_adjacency(lines): 4747 """Parse an adjacency format into BIOM 4748 4749 Parameters 4750 ---------- 4751 lines : list, str, or file-like object 4752 The tab delimited data to parse 4753 4754 Returns 4755 ------- 4756 biom.Table 4757 A BIOM ``Table`` object 4758 4759 Notes 4760 ----- 4761 The input is expected to be of the form: observation, sample, value. A 4762 header is not required, but if present, it must be of the form: 4763 4764 #OTU ID<tab>SampleID<tab>value 4765 4766 Raises 4767 ------ 4768 ValueError 4769 If the input is not an iterable or file-like object. 4770 ValueError 4771 If the data is incorrectly formatted. 4772 4773 Examples 4774 -------- 4775 Parse tab separated adjacency data into a table: 4776 4777 >>> from biom.table import Table 4778 >>> from io import StringIO 4779 >>> data = 'a\\tb\\t1\\na\\tc\\t2\\nd\\tc\\t3' 4780 >>> data_fh = StringIO(data) 4781 >>> test_table = Table.from_adjacency(data_fh) 4782 """ 4783 if not isinstance(lines, (list, tuple)): 4784 if hasattr(lines, 'readlines'): 4785 lines = lines.readlines() 4786 elif hasattr(lines, 'splitlines'): 4787 lines = lines.splitlines() 4788 else: 4789 raise ValueError("Not sure how to handle this input") 4790 4791 def is_num(item): 4792 # from https://stackoverflow.com/a/23059703 4793 numeric = re.compile(r'(?=.)([+-]?([0-9]*)(\.([0-9]+))?)([eE][+-]?\d+)?') # noqa 4794 match = numeric.match(item) 4795 start, stop = match.span() 4796 if (stop - start) == len(item): 4797 return True 4798 else: 4799 return False 4800 4801 # sanity check and determine if we have a header or not 4802 lh = lines[0].strip().split('\t') 4803 if len(lh) != 3: 4804 raise ValueError("Does not appear to be an adjacency format") 4805 elif lh == ['#OTU ID', 'SampleID', 'value']: 4806 include_line_zero = False 4807 elif is_num(lh[2]): 4808 # allow anything for columns 1 and 2, but test that column 3 is 4809 # numeric 4810 include_line_zero = True 4811 else: 4812 raise ValueError("Does not appear to be an adjacency format") 4813 4814 if not include_line_zero: 4815 lines = lines[1:] 4816 4817 # extract the entities 4818 observations = [] 4819 samples = [] 4820 values = [] 4821 for line in lines: 4822 parts = line.split('\t') 4823 assert len(parts) == 3 4824 observations.append(parts[0]) 4825 samples.append(parts[1]) 4826 values.append(float(parts[2])) 4827 4828 # determine a stable order and index positioning for the identifiers 4829 obs_order = sorted(set(observations)) 4830 samp_order = sorted(set(samples)) 4831 obs_index = {o: i for i, o in enumerate(obs_order)} 4832 samp_index = {s: i for i, s in enumerate(samp_order)} 4833 4834 # fill the matrix 4835 row = np.array([obs_index[obs] for obs in observations], dtype=int) 4836 col = np.array([samp_index[samp] for samp in samples], dtype=int) 4837 data = np.asarray(values) 4838 mat = coo_matrix((data, (row, col))) 4839 4840 return Table(mat, obs_order, samp_order) 4841 4842 @staticmethod 4843 def from_tsv(lines, obs_mapping, sample_mapping, 4844 process_func, **kwargs): 4845 """Parse a tab separated (observation x sample) formatted BIOM table 4846 4847 Parameters 4848 ---------- 4849 lines : list, or file-like object 4850 The tab delimited data to parse 4851 obs_mapping : dict or None 4852 The corresponding observation metadata 4853 sample_mapping : dict or None 4854 The corresponding sample metadata 4855 process_func : function 4856 A function to transform the observation metadata 4857 4858 Returns 4859 ------- 4860 biom.Table 4861 A BIOM ``Table`` object 4862 4863 Examples 4864 -------- 4865 Parse tab separated data into a table: 4866 4867 >>> from biom.table import Table 4868 >>> from io import StringIO 4869 >>> tsv = 'a\\tb\\tc\\n1\\t2\\t3\\n4\\t5\\t6' 4870 >>> tsv_fh = StringIO(tsv) 4871 >>> func = lambda x : x 4872 >>> test_table = Table.from_tsv(tsv_fh, None, None, func) 4873 """ 4874 (sample_ids, obs_ids, data, t_md, 4875 t_md_name) = Table._extract_data_from_tsv(lines, **kwargs) 4876 4877 # if we have it, keep it 4878 if t_md is None: 4879 obs_metadata = None 4880 else: 4881 obs_metadata = [{t_md_name: process_func(v)} for v in t_md] 4882 4883 if sample_mapping is None: 4884 sample_metadata = None 4885 else: 4886 sample_metadata = [sample_mapping[sample_id] 4887 for sample_id in sample_ids] 4888 4889 # will override any metadata from parsed table 4890 if obs_mapping is not None: 4891 obs_metadata = [obs_mapping[obs_id] for obs_id in obs_ids] 4892 4893 return Table(data, obs_ids, sample_ids, obs_metadata, sample_metadata) 4894 4895 @staticmethod 4896 def _extract_data_from_tsv(lines, delim='\t', dtype=float, md_parse=None): 4897 """Parse a classic table into (sample_ids, obs_ids, data, metadata, 4898 name) 4899 4900 Parameters 4901 ---------- 4902 lines: list or file-like object 4903 delimted data to parse 4904 delim: string 4905 delimeter in file lines 4906 dtype: type 4907 md_parse: function or None 4908 funtion used to parse metdata 4909 4910 Returns 4911 ------- 4912 list 4913 sample_ids 4914 list 4915 observation_ids 4916 array 4917 data 4918 list 4919 metadata 4920 string 4921 column name if last column is non-numeric 4922 4923 Notes 4924 ------ 4925 This is intended to be close to how QIIME classic OTU tables are parsed 4926 with the exception of the additional md_name field 4927 4928 This function is ported from QIIME (http://www.qiime.org), previously 4929 named parse_classic_otu_table. QIIME is a GPL project, but we obtained 4930 permission from the authors of this function to port it to the BIOM 4931 Format project (and keep it under BIOM's BSD license). 4932 4933 .. shownumpydoc 4934 """ 4935 def isfloat(value): 4936 # see https://stackoverflow.com/a/20929881 4937 try: 4938 float(value) 4939 return True 4940 except ValueError: 4941 return False 4942 4943 if not isinstance(lines, list): 4944 try: 4945 hasattr(lines, 'seek') 4946 except AttributeError: 4947 raise RuntimeError( 4948 "Input needs to support seek or be indexable") 4949 4950 # find header, the first line that is not empty and does not start 4951 # with a # 4952 header = False 4953 list_index = 0 4954 for line in lines: 4955 if not line.strip(): 4956 continue 4957 if not line.startswith('#'): 4958 # Covers the case where the first line is the header 4959 # and there is no indication of it (no comment character) 4960 if not header: 4961 header = line.rstrip().split(delim)[1:] 4962 data_start = list_index + 1 4963 else: 4964 data_start = list_index 4965 break 4966 list_index += 1 4967 header = line.strip().split(delim)[1:] 4968 4969 # If the first line is the header, then we need to get the data lines 4970 # line for the "last column" check 4971 if isinstance(lines, list): 4972 value_checks = lines[data_start:] 4973 else: 4974 lines.seek(0) 4975 for index in range(0, data_start): 4976 lines.readline() 4977 value_checks = [line for line in lines] 4978 4979 # attempt to determine if the last column is non-numeric, ie, metadata 4980 last_values = [line.rsplit(delim, 1)[-1].strip() 4981 for line in value_checks] 4982 last_column_is_numeric = all([isfloat(i) for i in last_values]) 4983 4984 # determine sample ids 4985 if last_column_is_numeric: 4986 md_name = None 4987 metadata = None 4988 samp_ids = header[:] 4989 else: 4990 md_name = header[-1] 4991 metadata = [] 4992 samp_ids = header[:-1] 4993 4994 data = [] 4995 obs_ids = [] 4996 row_number = 0 4997 4998 # Go back to the beginning if it is a file: 4999 if hasattr(lines, 'seek'): 5000 lines.seek(0) 5001 for index in range(0, data_start): 5002 line = lines.readline() 5003 else: 5004 lines = lines[data_start:] 5005 5006 for lineno, line in enumerate(lines, data_start): 5007 if not line.strip(): 5008 continue 5009 if line.startswith('#'): 5010 continue 5011 5012 fields = line.split(delim) 5013 fields[-1] = fields[-1].strip() 5014 obs_ids.append(fields[0]) 5015 5016 if last_column_is_numeric: 5017 try: 5018 values = list(map(dtype, fields[1:])) 5019 except ValueError: 5020 badval, badidx = _identify_bad_value(dtype, fields[1:]) 5021 msg = "Invalid value on line %d, column %d, value %s" 5022 raise TypeError(msg % (lineno, badidx+1, badval)) 5023 else: 5024 try: 5025 values = list(map(dtype, fields[1:-1])) 5026 except ValueError: 5027 badval, badidx = _identify_bad_value(dtype, fields[1:]) 5028 msg = "Invalid value on line %d, column %d, value %s" 5029 raise TypeError(msg % (lineno, badidx+1, badval)) 5030 5031 if md_parse is not None: 5032 metadata.append(md_parse(fields[-1])) 5033 else: 5034 metadata.append(fields[-1]) 5035 for column_number in range(0, len(values)): 5036 if values[column_number] != dtype(0): 5037 data.append([row_number, column_number, 5038 values[column_number]]) 5039 row_number += 1 5040 return samp_ids, obs_ids, data, metadata, md_name 5041 5042 def to_tsv(self, header_key=None, header_value=None, 5043 metadata_formatter=str, 5044 observation_column_name='#OTU ID', 5045 direct_io=None): 5046 """Return self as a string in tab delimited form 5047 5048 Default ``str`` output for the ``Table`` is just row/col ids and table 5049 data without any metadata 5050 5051 Parameters 5052 ---------- 5053 header_key : str or ``None``, optional 5054 Defaults to ``None`` 5055 header_value : str or ``None``, optional 5056 Defaults to ``None`` 5057 metadata_formatter : function, optional 5058 Defaults to ``str``. a function which takes a metadata entry and 5059 returns a formatted version that should be written to file 5060 observation_column_name : str, optional 5061 Defaults to "#OTU ID". The name of the first column in the output 5062 table, corresponding to the observation IDs. 5063 direct_io : file or file-like object, optional 5064 Defaults to ``None``. Must implement a ``write`` function. If 5065 `direct_io` is not ``None``, the final output is written directly 5066 to `direct_io` during processing. 5067 5068 Returns 5069 ------- 5070 str 5071 tab delimited representation of the Table 5072 5073 Examples 5074 -------- 5075 5076 >>> import numpy as np 5077 >>> from biom.table import Table 5078 5079 Create a 2x3 BIOM table, with observation metadata and no sample 5080 metadata: 5081 5082 >>> data = np.asarray([[0, 0, 1], [1, 3, 42]]) 5083 >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'], 5084 ... [{'foo': 'bar'}, {'x': 'y'}], None) 5085 >>> print(table.to_tsv()) # doctest: +NORMALIZE_WHITESPACE 5086 # Constructed from biom file 5087 #OTU ID S1 S2 S3 5088 O1 0.0 0.0 1.0 5089 O2 1.0 3.0 42.0 5090 >>> with open("result.tsv", "w") as f: 5091 table.to_tsv(direct_io=f) 5092 """ 5093 return self.delimited_self(u'\t', header_key, header_value, 5094 metadata_formatter, 5095 observation_column_name, 5096 direct_io=direct_io) 5097 5098 5099def coo_arrays_to_sparse(data, dtype=np.float64, shape=None): 5100 """Map directly on to the coo_matrix constructor 5101 5102 Parameters 5103 ---------- 5104 data : tuple 5105 data must be (values, (rows, cols)) 5106 dtype : type, optional 5107 Defaults to ``np.float64`` 5108 shape : tuple or ``None``, optional 5109 Defaults to ``None``. If `shape` is ``None``, shape will be determined 5110 automatically from `data`. 5111 """ 5112 if shape is None: 5113 values, (rows, cols) = data 5114 n_rows = max(rows) + 1 5115 n_cols = max(cols) + 1 5116 else: 5117 n_rows, n_cols = shape 5118 5119 # coo_matrix allows zeros to be added as data, and this affects 5120 # nnz, items, and iteritems. Clean them out here, as this is 5121 # the only time these zeros can creep in. 5122 # Note: coo_matrix allows duplicate entries; the entries will 5123 # be summed when converted. Not really sure how we want to 5124 # handle this generally within BIOM- I'm okay with leaving it 5125 # as undefined behavior for now. 5126 matrix = coo_matrix(data, shape=(n_rows, n_cols), dtype=dtype) 5127 matrix = matrix.tocsr() 5128 matrix.eliminate_zeros() 5129 return matrix 5130 5131 5132def list_list_to_sparse(data, dtype=float, shape=None): 5133 """Convert a list of lists into a scipy.sparse matrix. 5134 5135 Parameters 5136 ---------- 5137 data : iterable of iterables 5138 `data` should be in the format [[row, col, value], ...] 5139 dtype : type, optional 5140 defaults to ``float`` 5141 shape : tuple or ``None``, optional 5142 Defaults to ``None``. If `shape` is ``None``, shape will be determined 5143 automatically from `data`. 5144 5145 Returns 5146 ------- 5147 scipy.csr_matrix 5148 The newly generated matrix 5149 """ 5150 rows, cols, values = zip(*data) 5151 5152 if shape is None: 5153 n_rows = max(rows) + 1 5154 n_cols = max(cols) + 1 5155 else: 5156 n_rows, n_cols = shape 5157 5158 matrix = coo_matrix((values, (rows, cols)), shape=(n_rows, n_cols), 5159 dtype=dtype) 5160 matrix = matrix.tocsr() 5161 matrix.eliminate_zeros() 5162 return matrix 5163 5164 5165def nparray_to_sparse(data, dtype=float): 5166 """Convert a numpy array to a scipy.sparse matrix. 5167 5168 Parameters 5169 ---------- 5170 data : numpy.array 5171 The data to convert into a sparse matrix 5172 dtype : type, optional 5173 Defaults to ``float``. The type of data to be represented. 5174 5175 Returns 5176 ------- 5177 scipy.csr_matrix 5178 The newly generated matrix 5179 """ 5180 if data.shape == (0,): 5181 # an empty vector. Note, this short circuit is necessary as calling 5182 # csr_matrix([], shape=(0, 0), dtype=dtype) will result in a matrix 5183 # has a shape of (1, 0). 5184 return csr_matrix((0, 0), dtype=dtype) 5185 elif data.shape in ((1, 0), (0, 1)) and data.size == 0: 5186 # an empty matrix. This short circuit is necessary for the same reason 5187 # as the empty vector. While a (1, 0) matrix is _empty_, this does 5188 # confound code that assumes that (1, 0) means there might be metadata 5189 # or IDs associated with that singular row 5190 return csr_matrix((0, 0), dtype=dtype) 5191 elif len(data.shape) == 1: 5192 # a vector 5193 shape = (1, data.shape[0]) 5194 else: 5195 shape = data.shape 5196 5197 matrix = coo_matrix(data, shape=shape, dtype=dtype) 5198 matrix = matrix.tocsr() 5199 matrix.eliminate_zeros() 5200 return matrix 5201 5202 5203def list_nparray_to_sparse(data, dtype=float): 5204 """Takes a list of numpy arrays and creates a scipy.sparse matrix. 5205 5206 Parameters 5207 ---------- 5208 data : iterable of numpy.array 5209 The data to convert into a sparse matrix 5210 dtype : type, optional 5211 Defaults to ``float``. The type of data to be represented. 5212 5213 Returns 5214 ------- 5215 scipy.csr_matrix 5216 The newly generated matrix 5217 """ 5218 matrix = coo_matrix(data, shape=(len(data), len(data[0])), dtype=dtype) 5219 matrix = matrix.tocsr() 5220 matrix.eliminate_zeros() 5221 return matrix 5222 5223 5224def list_sparse_to_sparse(data, dtype=float): 5225 """Takes a list of scipy.sparse matrices and creates a scipy.sparse mat. 5226 5227 Parameters 5228 ---------- 5229 data : iterable of scipy.sparse matrices 5230 The data to convert into a sparse matrix 5231 dtype : type, optional 5232 Defaults to ``float``. The type of data to be represented. 5233 5234 Returns 5235 ------- 5236 scipy.csr_matrix 5237 The newly generated matrix 5238 """ 5239 if isspmatrix(data[0]): 5240 if data[0].shape[0] > data[0].shape[1]: 5241 n_cols = len(data) 5242 n_rows = data[0].shape[0] 5243 else: 5244 n_rows = len(data) 5245 n_cols = data[0].shape[1] 5246 else: 5247 all_keys = flatten([d.keys() for d in data]) 5248 n_rows = max(all_keys, key=itemgetter(0))[0] + 1 5249 n_cols = max(all_keys, key=itemgetter(1))[1] + 1 5250 if n_rows > n_cols: 5251 n_cols = len(data) 5252 else: 5253 n_rows = len(data) 5254 5255 data = vstack(data) 5256 matrix = coo_matrix(data, shape=(n_rows, n_cols), 5257 dtype=dtype) 5258 matrix = matrix.tocsr() 5259 matrix.eliminate_zeros() 5260 return matrix 5261 5262 5263def list_dict_to_sparse(data, dtype=float): 5264 """Takes a list of dict {(row,col):val} and creates a scipy.sparse mat. 5265 5266 Parameters 5267 ---------- 5268 data : iterable of dicts 5269 The data to convert into a sparse matrix 5270 dtype : type, optional 5271 Defaults to ``float``. The type of data to be represented. 5272 5273 Returns 5274 ------- 5275 scipy.csr_matrix 5276 The newly generated matrix 5277 """ 5278 if isspmatrix(data[0]): 5279 if data[0].shape[0] > data[0].shape[1]: 5280 is_col = True 5281 n_cols = len(data) 5282 n_rows = data[0].shape[0] 5283 else: 5284 is_col = False 5285 n_rows = len(data) 5286 n_cols = data[0].shape[1] 5287 else: 5288 all_keys = flatten([d.keys() for d in data]) 5289 n_rows = max(all_keys, key=itemgetter(0))[0] + 1 5290 n_cols = max(all_keys, key=itemgetter(1))[1] + 1 5291 if n_rows > n_cols: 5292 is_col = True 5293 n_cols = len(data) 5294 else: 5295 is_col = False 5296 n_rows = len(data) 5297 5298 rows = [] 5299 cols = [] 5300 vals = [] 5301 for row_idx, row in enumerate(data): 5302 for (row_val, col_idx), val in row.items(): 5303 if is_col: 5304 # transpose 5305 rows.append(row_val) 5306 cols.append(row_idx) 5307 vals.append(val) 5308 else: 5309 rows.append(row_idx) 5310 cols.append(col_idx) 5311 vals.append(val) 5312 5313 matrix = coo_matrix((vals, (rows, cols)), shape=(n_rows, n_cols), 5314 dtype=dtype) 5315 matrix = matrix.tocsr() 5316 matrix.eliminate_zeros() 5317 return matrix 5318 5319 5320def dict_to_sparse(data, dtype=float, shape=None): 5321 """Takes a dict {(row,col):val} and creates a scipy.sparse matrix. 5322 5323 Parameters 5324 ---------- 5325 data : dict 5326 The data to convert into a sparse matrix 5327 dtype : type, optional 5328 Defaults to ``float``. The type of data to be represented. 5329 5330 Returns 5331 ------- 5332 scipy.csr_matrix 5333 The newly generated matrix 5334 """ 5335 if shape is None: 5336 n_rows = max(data.keys(), key=itemgetter(0))[0] + 1 5337 n_cols = max(data.keys(), key=itemgetter(1))[1] + 1 5338 else: 5339 n_rows, n_cols = shape 5340 5341 rows = [] 5342 cols = [] 5343 vals = [] 5344 for (r, c), v in viewitems(data): 5345 rows.append(r) 5346 cols.append(c) 5347 vals.append(v) 5348 5349 return coo_arrays_to_sparse((vals, (rows, cols)), 5350 shape=(n_rows, n_cols), dtype=dtype) 5351