1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3"""
4BIOM Table (:mod:`biom.table`)
5==============================
6
7The biom-format project provides rich ``Table`` objects to support use of the
8BIOM file format. The objects encapsulate matrix data (such as OTU counts) and
9abstract the interaction away from the programmer.
10
11.. currentmodule:: biom.table
12
13Classes
14-------
15
16.. autosummary::
17   :toctree: generated/
18
19   Table
20
21Examples
22--------
23First, lets create a toy table to play around with. For this example, we're
24going to construct a 10x4 `Table`, or one that has 10 observations and 4
25samples. Each observation and sample will be given an arbitrary but unique
26name. We'll also add on some metadata.
27
28>>> import numpy as np
29>>> from biom.table import Table
30>>> data = np.arange(40).reshape(10, 4)
31>>> sample_ids = ['S%d' % i for i in range(4)]
32>>> observ_ids = ['O%d' % i for i in range(10)]
33>>> sample_metadata = [{'environment': 'A'}, {'environment': 'B'},
34...                    {'environment': 'A'}, {'environment': 'B'}]
35>>> observ_metadata = [{'taxonomy': ['Bacteria', 'Firmicutes']},
36...                    {'taxonomy': ['Bacteria', 'Firmicutes']},
37...                    {'taxonomy': ['Bacteria', 'Proteobacteria']},
38...                    {'taxonomy': ['Bacteria', 'Proteobacteria']},
39...                    {'taxonomy': ['Bacteria', 'Proteobacteria']},
40...                    {'taxonomy': ['Bacteria', 'Bacteroidetes']},
41...                    {'taxonomy': ['Bacteria', 'Bacteroidetes']},
42...                    {'taxonomy': ['Bacteria', 'Firmicutes']},
43...                    {'taxonomy': ['Bacteria', 'Firmicutes']},
44...                    {'taxonomy': ['Bacteria', 'Firmicutes']}]
45>>> table = Table(data, observ_ids, sample_ids, observ_metadata,
46...               sample_metadata, table_id='Example Table')
47
48Now that we have a table, let's explore it at a high level first.
49
50>>> table
5110 x 4 <class 'biom.table.Table'> with 39 nonzero entries (97% dense)
52>>> print(table) # doctest: +NORMALIZE_WHITESPACE
53# Constructed from biom file
54#OTU ID S0  S1  S2  S3
55O0  0.0 1.0 2.0 3.0
56O1  4.0 5.0 6.0 7.0
57O2  8.0 9.0 10.0    11.0
58O3  12.0    13.0    14.0    15.0
59O4  16.0    17.0    18.0    19.0
60O5  20.0    21.0    22.0    23.0
61O6  24.0    25.0    26.0    27.0
62O7  28.0    29.0    30.0    31.0
63O8  32.0    33.0    34.0    35.0
64O9  36.0    37.0    38.0    39.0
65>>> print(table.ids()) # doctest: +NORMALIZE_WHITESPACE
66['S0' 'S1' 'S2' 'S3']
67>>> print(table.ids(axis='observation')) # doctest: +NORMALIZE_WHITESPACE
68['O0' 'O1' 'O2' 'O3' 'O4' 'O5' 'O6' 'O7' 'O8' 'O9']
69>>> print(table.nnz)  # number of nonzero entries
7039
71
72While it's fun to just poke at the table, let's dig deeper. First, we're going
73to convert `table` into relative abundances (within each sample), and then
74filter `table` to just the samples associated with environment 'A'. The
75filtering gets fancy: we can pass in an arbitrary function to determine what
76samples we want to keep. This function must accept a sparse vector of values,
77the corresponding ID and the corresponding metadata, and should return ``True``
78or ``False``, where ``True`` indicates that the vector should be retained.
79
80>>> normed = table.norm(axis='sample', inplace=False)
81>>> filter_f = lambda values, id_, md: md['environment'] == 'A'
82>>> env_a = normed.filter(filter_f, axis='sample', inplace=False)
83>>> print(env_a) # doctest: +NORMALIZE_WHITESPACE
84# Constructed from biom file
85#OTU ID S0  S2
86O0  0.0 0.01
87O1  0.0222222222222 0.03
88O2  0.0444444444444 0.05
89O3  0.0666666666667 0.07
90O4  0.0888888888889 0.09
91O5  0.111111111111  0.11
92O6  0.133333333333  0.13
93O7  0.155555555556  0.15
94O8  0.177777777778  0.17
95O9  0.2 0.19
96
97But, what if we wanted individual tables per environment? While we could just
98perform some fancy iteration, we can instead just rely on `Table.partition` for
99these operations. `partition`, like `filter`, accepts a function. However, the
100`partition` method only passes the corresponding ID and metadata to the
101function. The function should return what partition the data are a part of.
102Within this example, we're also going to sum up our tables over the partitioned
103samples. Please note that we're using the original table (ie, not normalized)
104here.
105
106>>> part_f = lambda id_, md: md['environment']
107>>> env_tables = table.partition(part_f, axis='sample')
108>>> for partition, env_table in env_tables:
109...     print(partition, env_table.sum('sample'))
110A [ 180.  200.]
111B [ 190.  210.]
112
113For this last example, and to highlight a bit more functionality, we're going
114to first transform the table such that all multiples of three will be retained,
115while all non-multiples of three will get set to zero. Following this, we'll
116then collpase the table by taxonomy, and then convert the table into
117presence/absence data.
118
119First, let's setup the transform. We're going to define a function that takes
120the modulus of every value in the vector, and see if it is equal to zero. If it
121is equal to zero, we'll keep the value, otherwise we'll set the value to zero.
122
123>>> transform_f = lambda v,i,m: np.where(v % 3 == 0, v, 0)
124>>> mult_of_three = tform = table.transform(transform_f, inplace=False)
125>>> print(mult_of_three) # doctest: +NORMALIZE_WHITESPACE
126# Constructed from biom file
127#OTU ID S0  S1  S2  S3
128O0  0.0 0.0 0.0 3.0
129O1  0.0 0.0 6.0 0.0
130O2  0.0 9.0 0.0 0.0
131O3  12.0    0.0 0.0 15.0
132O4  0.0 0.0 18.0    0.0
133O5  0.0 21.0    0.0 0.0
134O6  24.0    0.0 0.0 27.0
135O7  0.0 0.0 30.0    0.0
136O8  0.0 33.0    0.0 0.0
137O9  36.0    0.0 0.0 39.0
138
139Next, we're going to collapse the table over the phylum level taxon. To do
140this, we're going to define a helper variable for the index position of the
141phylum (see the construction of the table above). Next, we're going to pass
142this to `Table.collapse`, and since we want to collapse over the observations,
143we'll need to specify 'observation' as the axis.
144
145>>> phylum_idx = 1
146>>> collapse_f = lambda id_, md: '; '.join(md['taxonomy'][:phylum_idx + 1])
147>>> collapsed = mult_of_three.collapse(collapse_f, axis='observation')
148>>> print(collapsed) # doctest: +NORMALIZE_WHITESPACE
149# Constructed from biom file
150#OTU ID S0  S1  S2  S3
151Bacteria; Firmicutes  7.2 6.6 7.2 8.4
152Bacteria; Proteobacteria  4.0 3.0 6.0 5.0
153Bacteria; Bacteroidetes   12.0    10.5    0.0 13.5
154
155Finally, let's convert the table to presence/absence data.
156
157>>> pa = collapsed.pa()
158>>> print(pa) # doctest: +NORMALIZE_WHITESPACE
159# Constructed from biom file
160#OTU ID S0  S1  S2  S3
161Bacteria; Firmicutes  1.0 1.0 1.0 1.0
162Bacteria; Proteobacteria  1.0 1.0 1.0 1.0
163Bacteria; Bacteroidetes   1.0 1.0 0.0 1.0
164
165"""
166
167# -----------------------------------------------------------------------------
168# Copyright (c) 2011-2020, The BIOM Format Development Team.
169#
170# Distributed under the terms of the Modified BSD License.
171#
172# The full license is in the file COPYING.txt, distributed with this software.
173# -----------------------------------------------------------------------------
174
175from __future__ import division
176import numpy as np
177import scipy.stats
178from copy import deepcopy
179from datetime import datetime
180from json import dumps
181from functools import reduce, partial
182from operator import itemgetter, or_
183from future.builtins import zip
184from future.utils import viewitems
185from collections import defaultdict, Hashable, Iterable
186from numpy import ndarray, asarray, zeros, newaxis
187from scipy.sparse import (coo_matrix, csc_matrix, csr_matrix, isspmatrix,
188                          vstack, hstack)
189import pandas as pd
190import re
191import six
192from future.utils import string_types as _future_string_types
193from biom.exception import (TableException, UnknownAxisError, UnknownIDError,
194                            DisjointIDError)
195from biom.util import (get_biom_format_version_string,
196                       get_biom_format_url_string, flatten, natsort,
197                       prefer_self, index_list, H5PY_VLEN_STR, HAVE_H5PY,
198                       __format_version__)
199from biom.err import errcheck
200from ._filter import _filter
201from ._transform import _transform
202from ._subsample import _subsample
203
204
205if not six.PY3:
206    string_types = list(_future_string_types)
207    string_types.append(str)
208    string_types.append(unicode)  # noqa
209    string_types = tuple(string_types)
210else:
211    string_types = _future_string_types
212
213
214__author__ = "Daniel McDonald"
215__copyright__ = "Copyright 2011-2020, The BIOM Format Development Team"
216__credits__ = ["Daniel McDonald", "Jai Ram Rideout", "Greg Caporaso",
217               "Jose Clemente", "Justin Kuczynski", "Adam Robbins-Pianka",
218               "Joshua Shorenstein", "Jose Antonio Navas Molina",
219               "Jorge Cañardo Alastuey", "Steven Brown"]
220__license__ = "BSD"
221__url__ = "http://biom-format.org"
222__maintainer__ = "Daniel McDonald"
223__email__ = "daniel.mcdonald@colorado.edu"
224
225
226MATRIX_ELEMENT_TYPE = {'int': int, 'float': float, 'unicode': str,
227                       u'int': int, u'float': float, u'unicode': str}
228
229
230def _identify_bad_value(dtype, fields):
231    """Identify the first value which cannot be cast
232
233    Paramters
234    ---------
235    dtype : type
236        A type to cast to
237    fields : Iterable of str
238        A series of str to cast into dtype
239
240    Returns
241    -------
242    str or None
243        A value that cannot be cast
244    int or None
245        The index of the value that cannot be cast
246    """
247    badval = None
248    badidx = None
249    for idx, v in enumerate(fields):
250        try:
251            dtype(v)
252        except:  # noqa
253            badval = v
254            badidx = idx
255            break
256    return (badval, badidx)
257
258
259def general_parser(x):
260    if isinstance(x, bytes):
261        x = x.decode('utf8')
262    return x
263
264
265def vlen_list_of_str_parser(value):
266    """Parses the taxonomy value"""
267    new_value = []
268    for v in value:
269        if v:
270            if isinstance(v, bytes):
271                v = v.decode('utf8')
272            new_value.append(v)
273
274    return new_value if new_value else None
275
276
277def general_formatter(grp, header, md, compression):
278    """Creates a dataset for a general atomic type category"""
279    shape = (len(md),)
280    name = 'metadata/%s' % header
281    dtypes = [type(m[header]) for m in md]
282
283    if set(dtypes).issubset(set(string_types)):
284        grp.create_dataset(name, shape=shape,
285                           dtype=H5PY_VLEN_STR,
286                           data=[m[header].encode('utf8') for m in md],
287                           compression=compression)
288    elif set(dtypes).issubset({list, tuple}):
289        vlen_list_of_str_formatter(grp, header, md, compression)
290    else:
291        formatted = []
292        dtypes_used = []
293        for dt, m in zip(dtypes, md):
294            val = m[header]
295            if val is None:
296                val = '\0'
297                dt = str
298
299            if dt in string_types:
300                val = val.encode('utf8')
301
302            formatted.append(val)
303            dtypes_used.append(dt)
304
305        if set(dtypes_used).issubset(set(string_types)):
306            dtype_to_use = H5PY_VLEN_STR
307        else:
308            dtype_to_use = None
309
310        # try our best...
311        grp.create_dataset(
312            name, shape=(len(md),),
313            dtype=dtype_to_use,
314            data=formatted,
315            compression=compression)
316
317
318def vlen_list_of_str_formatter(grp, header, md, compression):
319    """Creates a (N, ?) vlen str dataset"""
320    # It is possible that the value for some sample/observation
321    # is None. In that case, we still need to see them as
322    # iterables, but their length will be 0
323    iterable_checks = []
324    lengths = []
325    for m in md:
326        if m[header] is None:
327            iterable_checks.append(True)
328        elif isinstance(m.get(header), str):
329            iterable_checks.append(False)
330        else:
331            iterable_checks.append(
332                isinstance(m.get(header, []), Iterable))
333            lengths.append(len(m[header]))
334
335    if not np.all(iterable_checks):
336        if header == 'taxonomy':
337            # attempt to handle the general case issue where the taxonomy
338            # was not split on semicolons and represented as a flat string
339            # instead of a list
340            def split_and_strip(i):
341                parts = i.split(';')
342                return [p.strip() for p in parts]
343            try:
344                new_md = []
345                lengths = []
346                for m in md:
347                    parts = split_and_strip(m[header])
348                    new_md.append({header: parts})
349                    lengths.append(len(parts))
350                md = new_md
351            except:  # noqa
352                raise TypeError("Category '%s' is not formatted properly. The "
353                                "most common issue is when 'taxonomy' is "
354                                "represented as a flat string instead of a "
355                                "list. An attempt was made to split this "
356                                "field on a ';' to coerce it into a list but "
357                                "it failed. An example entry (which is not "
358                                "assured to be the problematic entry) is "
359                                "below:\n%s" % (header, md[0][header]))
360        else:
361            raise TypeError(
362                "Category %s not formatted correctly. Did you pass"
363                " --process-obs-metadata taxonomy when converting "
364                " from tsv? Please see Table.to_hdf5 docstring for"
365                " more information" % header)
366
367    max_list_len = max(lengths)
368    shape = (len(md), max_list_len)
369    data = np.empty(shape, dtype=object)
370    for i, m in enumerate(md):
371        if m[header] is None:
372            continue
373        value = np.asarray(m[header])
374        data[i, :len(value)] = [v.encode('utf8') for v in value]
375    # Change the None entries on data to empty strings ""
376    data = np.where(data == np.array(None), "", data)
377    grp.create_dataset(
378        'metadata/%s' % header, shape=shape,
379        dtype=H5PY_VLEN_STR, data=data,
380        compression=compression)
381
382
383class Table(object):
384
385    """The (canonically pronounced 'teh') Table.
386
387    Give in to the power of the Table!
388
389    Creates an in-memory representation of a BIOM file. BIOM version 1.0 is
390    based on JSON to provide the overall structure for the format while
391    versions 2.0 and 2.1 are based on HDF5. For more information see [1]_
392    and [2]_
393
394    Paramaters
395    ----------
396    data : array_like
397        An (N,M) sample by observation matrix represented as one of these
398        types:
399            An 1-dimensional array of values
400            An n-dimensional array of values
401            An empty list
402            A list of numpy arrays
403            A list of dict
404            A list of sparse matrices
405            A dictionary of values
406            A list of lists
407            A sparse matrix of values
408    observation_ids : array_like of str
409        A (N,) dataset of the observation IDs, where N is the total number
410        of IDs
411    sample_ids : array_like of str
412        A (M,) dataset of the sample IDs, where M is the total number of IDs
413    observation_metadata : list of dicts, optional
414        per observation dictionary of annotations where every key represents a
415        metadata field that contains specific metadata information,
416        ie taxonomy, KEGG pathway, etc
417    sample_metadata : array_like of dicts, optional
418        per sample dictionary of annotations where every key represents a
419        metadata field that contains sample specific metadata information, ie
420    table_id : str, optional
421        A field that can be used to identify the table
422    type : {None, "OTU table", "Pathway table", "Function table",
423            "Ortholog table", "Gene table", "Metabolite table", "Taxon table"}
424        The type of table represented
425    create_date : str, optional
426        Date that this table was built
427    generated_by : str, optional
428        Individual who built the table
429    observation_group_metadata : list, optional
430        group that contains observation specific group metadata information
431        (e.g., phylogenetic tree)
432    sample_group_metadata : list, optional
433        group that contains sample specific group metadata information
434        (e.g., relationships between samples)
435
436    Attributes
437    ----------
438    shape
439    dtype
440    nnz
441    matrix_data
442    type
443    table_id
444    create_date
445    generated_by
446    format_version
447
448    Raises
449    ------
450    TableException
451        When an invalid table type is provided.
452
453    References
454    ----------
455    .. [1] http://biom-format.org/documentation/biom_format.html
456    .. [2] D. McDonald, et al. "The Biological Observation Matrix (BIOM) format
457       or: how I learned to stop worrying and love the ome-ome"
458       GigaScience 2012 1:7
459    """
460
461    def __init__(self, data, observation_ids, sample_ids,
462                 observation_metadata=None, sample_metadata=None,
463                 table_id=None, type=None, create_date=None, generated_by=None,
464                 observation_group_metadata=None, sample_group_metadata=None,
465                 **kwargs):
466
467        self.type = type
468        self.table_id = table_id
469        self.create_date = create_date
470        self.generated_by = generated_by
471        self.format_version = __format_version__
472
473        if not isspmatrix(data):
474            shape = (len(observation_ids), len(sample_ids))
475            input_is_dense = kwargs.get('input_is_dense', False)
476            self._data = Table._to_sparse(data, input_is_dense=input_is_dense,
477                                          shape=shape)
478        else:
479            self._data = data.tocsr()
480
481        self._data = self._data.astype(float)
482
483        # using object to allow for variable length strings
484        self._sample_ids = np.asarray(sample_ids, dtype=object)
485        self._observation_ids = np.asarray(observation_ids, dtype=object)
486
487        if sample_metadata is not None:
488            # not m will evaluate True if the object tested is None or
489            # an empty dict, etc.
490            if {not m for m in sample_metadata} == {True, }:
491                self._sample_metadata = None
492            else:
493                self._sample_metadata = tuple(sample_metadata)
494        else:
495            self._sample_metadata = None
496
497        if observation_metadata is not None:
498            # not m will evaluate True if the object tested is None or
499            # an empty dict, etc.
500            if {not m for m in observation_metadata} == {True, }:
501                self._observation_metadata = None
502            else:
503                self._observation_metadata = tuple(observation_metadata)
504        else:
505            self._observation_metadata = None
506
507        self._sample_group_metadata = sample_group_metadata
508        self._observation_group_metadata = observation_group_metadata
509
510        errcheck(self)
511
512        # These will be set by _index_ids()
513        self._sample_index = None
514        self._obs_index = None
515
516        self._cast_metadata()
517        self._index_ids()
518
519    def _index_ids(self):
520        """Sets lookups {id:index in _data}.
521
522        Should only be called in constructor as this modifies state.
523        """
524        self._sample_index = index_list(self._sample_ids)
525        self._obs_index = index_list(self._observation_ids)
526
527    def _index(self, axis='sample'):
528        """Return the index lookups of the given axis
529
530        Parameters
531        ----------
532        axis : {'sample', 'observation'}, optional
533            Axis to get the index dict. Defaults to 'sample'
534
535        Returns
536        -------
537        dict
538            lookups {id:index}
539
540        Raises
541        ------
542        UnknownAxisError
543            If provided an unrecognized axis.
544        """
545        if axis == 'sample':
546            return self._sample_index
547        elif axis == 'observation':
548            return self._obs_index
549        else:
550            raise UnknownAxisError(axis)
551
552    def _conv_to_self_type(self, vals, transpose=False, dtype=None):
553        """For converting vectors to a compatible self type"""
554        if dtype is None:
555            dtype = self.dtype
556
557        if isspmatrix(vals):
558            return vals
559        else:
560            return Table._to_sparse(vals, transpose, dtype)
561
562    @staticmethod
563    def _to_dense(vec):
564        """Converts a row/col vector to a dense numpy array.
565
566        Always returns a 1-D row vector for consistency with numpy iteration
567        over arrays.
568        """
569        dense_vec = vec.toarray()
570
571        if vec.shape == (1, 1):
572            # Handle the special case where we only have a single element, but
573            # we don't want to return a numpy scalar / 0-d array. We still want
574            # to return a vector of length 1.
575            return dense_vec.reshape(1)
576        else:
577            return np.squeeze(dense_vec)
578
579    @staticmethod
580    def _to_sparse(values, transpose=False, dtype=float, input_is_dense=False,
581                   shape=None):
582        """Try to return a populated scipy.sparse matrix.
583
584        NOTE: assumes the max value observed in row and col defines the size of
585        the matrix.
586        """
587        # if it is a vector
588        if isinstance(values, ndarray) and len(values.shape) == 1:
589            if transpose:
590                mat = nparray_to_sparse(values[:, newaxis], dtype)
591            else:
592                mat = nparray_to_sparse(values, dtype)
593            return mat
594        if isinstance(values, ndarray):
595            if transpose:
596                mat = nparray_to_sparse(values.T, dtype)
597            else:
598                mat = nparray_to_sparse(values, dtype)
599            return mat
600        # the empty list
601        elif isinstance(values, list) and len(values) == 0:
602            return coo_matrix((0, 0))
603        # list of np vectors
604        elif isinstance(values, list) and isinstance(values[0], ndarray):
605            mat = list_nparray_to_sparse(values, dtype)
606            if transpose:
607                mat = mat.T
608            return mat
609        # list of dicts, each representing a row in row order
610        elif isinstance(values, list) and isinstance(values[0], dict):
611            mat = list_dict_to_sparse(values, dtype)
612            if transpose:
613                mat = mat.T
614            return mat
615        # list of scipy.sparse matrices, each representing a row in row order
616        elif isinstance(values, list) and isspmatrix(values[0]):
617            mat = list_sparse_to_sparse(values, dtype)
618            if transpose:
619                mat = mat.T
620            return mat
621        elif isinstance(values, dict):
622            mat = dict_to_sparse(values, dtype, shape)
623            if transpose:
624                mat = mat.T
625            return mat
626        elif isinstance(values, list) and isinstance(values[0], list):
627            if input_is_dense:
628                d = coo_matrix(values)
629                mat = coo_arrays_to_sparse((d.data, (d.row, d.col)),
630                                           dtype=dtype, shape=shape)
631            else:
632                mat = list_list_to_sparse(values, dtype, shape=shape)
633            return mat
634        elif isspmatrix(values):
635            mat = values
636            if transpose:
637                mat = mat.transpose()
638            return mat
639        else:
640            raise TableException("Unknown input type")
641
642    def _cast_metadata(self):
643        """Casts all metadata to defaultdict to support default values.
644
645        Should be called after any modifications to sample/observation
646        metadata.
647        """
648        def cast_metadata(md):
649            """Do the actual casting"""
650            default_md = []
651            # if we have a list of [None], set to None
652            if md is not None:
653                if md.count(None) == len(md):
654                    return None
655            if md is not None:
656                for item in md:
657                    d = defaultdict(lambda: None)
658
659                    if isinstance(item, dict):
660                        d.update(item)
661                    elif item is None:
662                        pass
663                    else:
664                        raise TableException("Unable to cast metadata: %s" %
665                                             repr(item))
666                    default_md.append(d)
667                return tuple(default_md)
668            return md
669
670        self._sample_metadata = cast_metadata(self._sample_metadata)
671        self._observation_metadata = cast_metadata(self._observation_metadata)
672
673        self._sample_group_metadata = (
674            self._sample_group_metadata
675            if self._sample_group_metadata else None)
676        self._observation_group_metadata = (
677            self._observation_group_metadata
678            if self._observation_group_metadata else None)
679
680    @property
681    def shape(self):
682        """The shape of the underlying contingency matrix"""
683        return self._data.shape
684
685    @property
686    def dtype(self):
687        """The type of the objects in the underlying contingency matrix"""
688        return self._data.dtype
689
690    @property
691    def nnz(self):
692        """Number of non-zero elements of the underlying contingency matrix"""
693        self._data.eliminate_zeros()
694        return self._data.nnz
695
696    @property
697    def matrix_data(self):
698        """The sparse matrix object"""
699        return self._data
700
701    def length(self, axis='sample'):
702        """Return the length of an axis
703
704        Parameters
705        ----------
706        axis : {'sample', 'observation'}, optional
707            The axis to operate on
708
709        Raises
710        ------
711        UnknownAxisError
712            If provided an unrecognized axis.
713
714        Examples
715        --------
716        >>> from biom import example_table
717        >>> print(example_table.length(axis='sample'))
718        3
719        >>> print(example_table.length(axis='observation'))
720        2
721        """
722        if axis not in ('sample', 'observation'):
723            raise UnknownAxisError(axis)
724
725        return self.shape[1] if axis == 'sample' else self.shape[0]
726
727    def add_group_metadata(self, group_md, axis='sample'):
728        """Take a dict of group metadata and add it to an axis
729
730        Parameters
731        ----------
732        group_md : dict of tuples
733            `group_md` should be of the form ``{category: (data type, value)``
734        axis : {'sample', 'observation'}, optional
735            The axis to operate on
736
737        Raises
738        ------
739        UnknownAxisError
740            If provided an unrecognized axis.
741        """
742        if axis == 'sample':
743            if self._sample_group_metadata is not None:
744                self._sample_group_metadata.update(group_md)
745            else:
746                self._sample_group_metadata = group_md
747        elif axis == 'observation':
748            if self._observation_group_metadata is not None:
749                self._observation_group_metadata.update(group_md)
750            else:
751                self._observation_group_metadata = group_md
752        else:
753            raise UnknownAxisError(axis)
754
755    def del_metadata(self, keys=None, axis='whole'):
756        """Remove metadata from an axis
757
758        Parameters
759        ----------
760        keys : list of str, optional
761            The keys to remove from metadata. If None, all keys from the axis
762            are removed.
763        axis : {'sample', 'observation', 'whole'}, optional
764            The axis to operate on. If 'whole', the operation is applied to
765            both the sample and observation axes.
766
767        Raises
768        ------
769        UnknownAxisError
770            If the requested axis does not exist.
771
772        Examples
773        --------
774        >>> from biom import Table
775        >>> import numpy as np
776        >>> tab = Table(np.array([[1, 2], [3, 4]]),
777        ...             ['O1', 'O2'],
778        ...             ['S1', 'S2'],
779        ...             sample_metadata=[{'barcode': 'ATGC', 'env': 'A'},
780        ...                              {'barcode': 'GGTT', 'env': 'B'}])
781        >>> tab.del_metadata(keys=['env'])
782        >>> for id, md in zip(tab.ids(), tab.metadata()):
783        ...     print(id, list(md.items()))
784        S1 [('barcode', 'ATGC')]
785        S2 [('barcode', 'GGTT')]
786        """
787        if axis == 'whole':
788            axes = ['sample', 'observation']
789        elif axis in ('sample', 'observation'):
790            axes = [axis]
791        else:
792            raise UnknownAxisError("%s is not recognized" % axis)
793
794        if keys is None:
795            if axis == 'whole':
796                self._sample_metadata = None
797                self._observation_metadata = None
798            elif axis == 'sample':
799                self._sample_metadata = None
800            else:
801                self._observation_metadata = None
802            return
803
804        for ax in axes:
805            if self.metadata(axis=ax) is None:
806                continue
807
808            for i, md in zip(self.ids(axis=ax), self.metadata(axis=ax)):
809                for k in keys:
810                    if k in md:
811                        del md[k]
812
813            # for consistency with init on absence of metadata
814            empties = {True if not md else False
815                       for md in self.metadata(axis=ax)}
816            if empties == {True, }:
817                if ax == 'sample':
818                    self._sample_metadata = None
819                else:
820                    self._observation_metadata = None
821
822    def add_metadata(self, md, axis='sample'):
823        """Take a dict of metadata and add it to an axis.
824
825        Parameters
826        ----------
827        md : dict of dict
828            `md` should be of the form ``{id: {dict_of_metadata}}``
829        axis : {'sample', 'observation'}, optional
830            The axis to operate on
831        """
832        metadata = self.metadata(axis=axis)
833        if metadata is not None:
834            for id_, md_entry in viewitems(md):
835                if self.exists(id_, axis=axis):
836                    idx = self.index(id_, axis=axis)
837                    metadata[idx].update(md_entry)
838        else:
839            ids = self.ids(axis=axis)
840            if axis == 'sample':
841                self._sample_metadata = tuple(
842                    [md[id_] if id_ in md else None for id_ in ids])
843            elif axis == 'observation':
844                self._observation_metadata = tuple(
845                    [md[id_] if id_ in md else None for id_ in ids])
846            else:
847                raise UnknownAxisError(axis)
848        self._cast_metadata()
849
850    def __getitem__(self, args):
851        """Handles row or column slices
852
853        Slicing over an individual axis is supported, but slicing over both
854        axes at the same time is not supported. Partial slices, such as
855        `foo[0, 5:10]` are not supported, however full slices are supported,
856        such as `foo[0, :]`.
857
858        Parameters
859        ----------
860        args : tuple or slice
861            The specific element (by index position) to return or an entire
862            row or column of the data.
863
864        Returns
865        -------
866        float or spmatrix
867            A float is return if a specific element is specified, otherwise a
868            spmatrix object representing a vector of sparse data is returned.
869
870        Raises
871        ------
872        IndexError
873            - If the matrix is empty
874            - If the arguments do not appear to be a tuple
875            - If a slice on row and column is specified
876            - If a partial slice is specified
877
878        Notes
879        -----
880        Switching between slicing rows and columns is inefficient.  Slicing of
881        rows requires a CSR representation, while slicing of columns requires a
882        CSC representation, and transforms are performed on the data if the
883        data are not in the required representation. These transforms can be
884        expensive if done frequently.
885
886        .. shownumpydoc
887        """
888        if self.is_empty():
889            raise IndexError("Cannot retrieve an element from an empty/null "
890                             "table.")
891
892        try:
893            row, col = args
894        except:  # noqa
895            raise IndexError("Must specify (row, col).")
896
897        if isinstance(row, slice) and isinstance(col, slice):
898            raise IndexError("Can only slice a single axis.")
899
900        if isinstance(row, slice):
901            if row.start is None and row.stop is None:
902                return self._get_col(col)
903            else:
904                raise IndexError("Can only handle full : slices per axis.")
905        elif isinstance(col, slice):
906            if col.start is None and col.stop is None:
907                return self._get_row(row)
908            else:
909                raise IndexError("Can only handle full : slices per axis.")
910        else:
911            if self._data.getformat() == 'coo':
912                self._data = self._data.tocsr()
913
914            return self._data[row, col]
915
916    def _get_row(self, row_idx):
917        """Return the row at ``row_idx``.
918
919        A row vector will be returned as a scipy.sparse matrix in csr format.
920
921        Notes
922        -----
923        Switching between slicing rows and columns is inefficient.  Slicing of
924        rows requires a CSR representation, while slicing of columns requires a
925        CSC representation, and transforms are performed on the data if the
926        data are not in the required representation. These transforms can be
927        expensive if done frequently.
928
929        """
930        self._data = self._data.tocsr()
931        return self._data.getrow(row_idx)
932
933    def _get_col(self, col_idx):
934        """Return the column at ``col_idx``.
935
936        A column vector will be returned as a scipy.sparse matrix in csc
937        format.
938
939        Notes
940        -----
941        Switching between slicing rows and columns is inefficient.  Slicing of
942        rows requires a CSR representation, while slicing of columns requires a
943        CSC representation, and transforms are performed on the data if the
944        data are not in the required representation. These transforms can be
945        expensive if done frequently.
946
947        """
948        self._data = self._data.tocsc()
949        return self._data.getcol(col_idx)
950
951    def reduce(self, f, axis):
952        """Reduce over axis using function `f`
953
954        Parameters
955        ----------
956        f : function
957            The function to use for the reduce operation
958        axis : {'sample', 'observation'}
959            The axis on which to operate
960
961        Returns
962        -------
963        numpy.array
964            A one-dimensional array representing the reduced rows
965            (observations) or columns (samples) of the data matrix
966
967        Raises
968        ------
969        UnknownAxisError
970            If `axis` is neither "sample" nor "observation"
971        TableException
972            If the table's data matrix is empty
973
974        Examples
975        --------
976        >>> import numpy as np
977        >>> from biom.table import Table
978
979        Create a 2x3 table
980
981        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
982        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'],
983        ...               [{'foo': 'bar'}, {'x': 'y'}], None)
984
985        Create a reduce function
986
987        >>> func = lambda x, y: x + y
988
989        Reduce table on samples
990
991        >>> table.reduce(func, 'sample') # doctest: +NORMALIZE_WHITESPACE
992        array([  1.,   3.,  43.])
993
994        Reduce table on observations
995
996        >>> table.reduce(func, 'observation') # doctest: +NORMALIZE_WHITESPACE
997        array([  1.,  46.])
998        """
999        if self.is_empty():
1000            raise TableException("Cannot reduce an empty table")
1001
1002        # np.apply_along_axis might reduce type conversions here and improve
1003        # speed. am opting for reduce right now as I think its more readable
1004        return asarray([reduce(f, v) for v in self.iter_data(axis=axis)])
1005
1006    def sum(self, axis='whole'):
1007        """Returns the sum by axis
1008
1009        Parameters
1010        ----------
1011        axis : {'whole', 'sample', 'observation'}, optional
1012            The axis on which to operate.
1013
1014        Returns
1015        -------
1016        numpy.array or float
1017            If `axis` is "whole", returns an float representing the whole
1018            table sum. If `axis` is either "sample" or "observation", returns a
1019            numpy.array that holds a sum for each sample or observation,
1020            respectively.
1021
1022        Examples
1023        --------
1024        >>> import numpy as np
1025        >>> from biom.table import Table
1026
1027        Create a 2x3 BIOM table:
1028
1029        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1030        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
1031
1032        Add all values in the table:
1033
1034        >>> table.sum()
1035        array(47.0)
1036
1037        Add all values per sample:
1038
1039        >>> table.sum(axis='sample') # doctest: +NORMALIZE_WHITESPACE
1040        array([  1.,  3.,  43.])
1041
1042        Add all values per observation:
1043
1044        >>> table.sum(axis='observation') # doctest: +NORMALIZE_WHITESPACE
1045        array([  1.,  46.])
1046        """
1047        if axis == 'whole':
1048            axis = None
1049        elif axis == 'sample':
1050            axis = 0
1051        elif axis == 'observation':
1052            axis = 1
1053        else:
1054            raise UnknownAxisError(axis)
1055
1056        matrix_sum = np.squeeze(np.asarray(self._data.sum(axis=axis)))
1057
1058        # We only want to return a scalar if the whole matrix was summed.
1059        if axis is not None and matrix_sum.shape == ():
1060            matrix_sum = matrix_sum.reshape(1)
1061
1062        return matrix_sum
1063
1064    def transpose(self):
1065        """Transpose the contingency table
1066
1067        The returned table will be an entirely new table, including copies of
1068        the (transposed) data, sample/observation IDs and metadata.
1069
1070        Returns
1071        -------
1072        Table
1073            Return a new table that is the transpose of caller table.
1074        """
1075        sample_md_copy = deepcopy(self.metadata())
1076        obs_md_copy = deepcopy(self.metadata(axis='observation'))
1077
1078        if self._data.getformat() == 'lil':
1079            # lil's transpose method doesn't have the copy kwarg, but all of
1080            # the others do.
1081            self._data = self._data.tocsr()
1082
1083        # sample ids and observations are reversed becuase we trasposed
1084        return self.__class__(self._data.transpose(copy=True),
1085                              self.ids()[:], self.ids(axis='observation')[:],
1086                              sample_md_copy, obs_md_copy, self.table_id)
1087
1088    def head(self, n=5, m=5):
1089        """Get the first n rows and m columns from self
1090
1091        Parameters
1092        ----------
1093        n : int, optional
1094            The number of rows (observations) to get. This number must be
1095            greater than 0. If not specified, 5 rows will be retrieved.
1096
1097        m : int, optional
1098            The number of columns (samples) to get. This number must be
1099            greater than 0. If not specified, 5 columns will be
1100            retrieved.
1101
1102        Notes
1103        -----
1104        Like `head` for Linux like systems, requesting more rows (or columns)
1105        than exists will silently work.
1106
1107        Raises
1108        ------
1109        IndexError
1110            If `n` or `m` are <= 0.
1111
1112        Returns
1113        -------
1114        Table
1115            The subset table.
1116
1117        Examples
1118        --------
1119        >>> import numpy as np
1120        >>> from biom.table import Table
1121        >>> data = np.arange(100).reshape(5, 20)
1122        >>> obs_ids = ['O%d' % i for i in range(1, 6)]
1123        >>> samp_ids = ['S%d' % i for i in range(1, 21)]
1124        >>> table = Table(data, obs_ids, samp_ids)
1125        >>> print(table.head())  # doctest: +NORMALIZE_WHITESPACE
1126        # Constructed from biom file
1127        #OTU ID S1  S2  S3  S4  S5
1128        O1  0.0 1.0 2.0 3.0 4.0
1129        O2  20.0 21.0 22.0 23.0 24.0
1130        O3  40.0 41.0 42.0 43.0 44.0
1131        O4  60.0 61.0 62.0 63.0 64.0
1132        O5  80.0 81.0 82.0 83.0 84.0
1133
1134        """
1135        if n <= 0:
1136            raise IndexError("n cannot be <= 0.")
1137
1138        if m <= 0:
1139            raise IndexError("m cannot be <= 0.")
1140
1141        row_ids = self.ids(axis='observation')[:n]
1142        col_ids = self.ids(axis='sample')[:m]
1143
1144        table = self.filter(row_ids, axis='observation', inplace=False)
1145        return table.filter(col_ids, axis='sample')
1146
1147    def group_metadata(self, axis='sample'):
1148        """Return the group metadata of the given axis
1149
1150        Parameters
1151        ----------
1152        axis : {'sample', 'observation'}, optional
1153            Axis to search for the group metadata. Defaults to 'sample'
1154
1155        Returns
1156        -------
1157        dict
1158            The corresponding group metadata for the given axis
1159
1160        Raises
1161        ------
1162        UnknownAxisError
1163            If provided an unrecognized axis.
1164
1165        Examples
1166        --------
1167        >>> import numpy as np
1168        >>> from biom.table import Table
1169
1170        Create a 2x3 BIOM table, with group observation metadata and no group
1171        sample metadata:
1172
1173        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1174        >>> group_observation_md = {'tree': ('newick', '(O1:0.3,O2:0.4);')}
1175        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'],
1176        ...               observation_group_metadata=group_observation_md)
1177
1178        Get the observation group metadata:
1179
1180        >>> table.group_metadata(axis='observation')
1181        {'tree': ('newick', '(O1:0.3,O2:0.4);')}
1182
1183        Get the sample group metadata:
1184
1185        >> table.group_metadata()
1186        None
1187        """
1188        if axis == 'sample':
1189            return self._sample_group_metadata
1190        elif axis == 'observation':
1191            return self._observation_group_metadata
1192        else:
1193            raise UnknownAxisError(axis)
1194
1195    def ids(self, axis='sample'):
1196        """Return the ids along the given axis
1197
1198        Parameters
1199        ----------
1200        axis : {'sample', 'observation'}, optional
1201            Axis to return ids from. Defaults to 'sample'
1202
1203        Returns
1204        -------
1205        1-D numpy array
1206            The ids along the given axis
1207
1208        Raises
1209        ------
1210        UnknownAxisError
1211            If provided an unrecognized axis.
1212
1213        Examples
1214        --------
1215        >>> import numpy as np
1216        >>> from biom.table import Table
1217
1218        Create a 2x3 BIOM table:
1219
1220        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1221        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
1222
1223        Get the ids along the observation axis:
1224
1225        >>> print(table.ids(axis='observation'))
1226        ['O1' 'O2']
1227
1228        Get the ids along the sample axis:
1229
1230        >>> print(table.ids())
1231        ['S1' 'S2' 'S3']
1232        """
1233        if axis == 'sample':
1234            return self._sample_ids
1235        elif axis == 'observation':
1236            return self._observation_ids
1237        else:
1238            raise UnknownAxisError(axis)
1239
1240    def update_ids(self, id_map, axis='sample', strict=True, inplace=True):
1241        """Update the ids along the given axis
1242
1243        Parameters
1244        ----------
1245        id_map : dict
1246            Mapping of old to new ids
1247        axis : {'sample', 'observation'}, optional
1248            Axis to search for `id`. Defaults to 'sample'
1249        strict : bool, optional
1250            If ``True``, raise an error if an id is present in the given axis
1251            but is not a key in ``id_map``. If False, retain old identifier
1252            for ids that are present in the given axis but are not keys in
1253            ``id_map``.
1254        inplace : bool, optional
1255            If ``True`` the ids are updated in ``self``; if ``False`` the ids
1256            are updated in a new table is returned.
1257
1258        Returns
1259        -------
1260        Table
1261            Table object where ids have been updated.
1262
1263        Raises
1264        ------
1265        UnknownAxisError
1266            If provided an unrecognized axis.
1267        TableException
1268            If an id from ``self`` is not in ``id_map`` and ``strict`` is
1269            ``True``.
1270
1271        Examples
1272        --------
1273        Create a 2x3 BIOM table:
1274
1275        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1276        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
1277
1278        Define a mapping of old to new sample ids:
1279
1280        >>> id_map = {'S1':'s1.1', 'S2':'s2.2', 'S3':'s3.3'}
1281
1282        Get the ids along the sample axis in the table:
1283
1284        >>> print(table.ids(axis='sample'))
1285        ['S1' 'S2' 'S3']
1286
1287        Update the sample ids and get the ids along the sample axis in the
1288        updated table:
1289
1290        >>> updated_table = table.update_ids(id_map, axis='sample')
1291        >>> print(updated_table.ids(axis='sample'))
1292        ['s1.1' 's2.2' 's3.3']
1293        """
1294        updated_ids = zeros(self.ids(axis=axis).size, dtype=object)
1295        for idx, old_id in enumerate(self.ids(axis=axis)):
1296            if strict and old_id not in id_map:
1297                raise TableException(
1298                    "Mapping not provided for %s identifier: %s. If this "
1299                    "identifier should not be updated, pass strict=False."
1300                    % (axis, old_id))
1301
1302            updated_ids[idx] = id_map.get(old_id, old_id)
1303
1304        # prepare the result object and update the ids along the specified
1305        # axis
1306        result = self if inplace else self.copy()
1307        if axis == 'sample':
1308            result._sample_ids = updated_ids
1309        else:
1310            result._observation_ids = updated_ids
1311
1312        result._index_ids()
1313
1314        # check for errors (specifically, we want to esnsure that duplicate
1315        # ids haven't been introduced)
1316        errcheck(result)
1317
1318        return result
1319
1320    def _get_sparse_data(self, axis='sample'):
1321        """Returns the internal data in the correct sparse representation
1322
1323        Parameters
1324        ----------
1325        axis : {'sample', 'observation'}, optional
1326            Axis to search for `id`. Defaults to 'sample'
1327
1328        Returns
1329        -------
1330        sparse matrix
1331            The data in csc (axis='sample') or csr (axis='observation')
1332            representation
1333        """
1334        if axis == 'sample':
1335            return self._data.tocsc()
1336        elif axis == 'observation':
1337            return self._data.tocsr()
1338        else:
1339            raise UnknownAxisError(axis)
1340
1341    def metadata(self, id=None, axis='sample'):
1342        """Return the metadata of the identified sample/observation.
1343
1344        Parameters
1345        ----------
1346        id : str
1347            ID of the sample or observation whose index will be returned.
1348        axis : {'sample', 'observation'}
1349            Axis to search for `id`.
1350
1351        Returns
1352        -------
1353        defaultdict or None
1354            The corresponding metadata ``defaultdict`` or ``None`` of that axis
1355            does not have metadata.
1356
1357        Raises
1358        ------
1359        UnknownAxisError
1360            If provided an unrecognized axis.
1361        UnknownIDError
1362            If provided an unrecognized sample/observation ID.
1363
1364        Examples
1365        --------
1366        >>> import numpy as np
1367        >>> from biom.table import Table
1368
1369        Create a 2x3 BIOM table, with observation metadata and no sample
1370        metadata:
1371
1372        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1373        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'],
1374        ...               [{'foo': 'bar'}, {'x': 'y'}], None)
1375
1376        Get the metadata of the observation with ID "O2":
1377
1378        >>> # casting to `dict` as the return is `defaultdict`
1379        >>> dict(table.metadata('O2', 'observation'))
1380        {'x': 'y'}
1381
1382        Get the metadata of the sample with ID "S1":
1383
1384        >>> table.metadata('S1', 'sample') is None
1385        True
1386        """
1387        if axis == 'sample':
1388            md = self._sample_metadata
1389        elif axis == 'observation':
1390            md = self._observation_metadata
1391        else:
1392            raise UnknownAxisError(axis)
1393
1394        if id is None:
1395            return md
1396
1397        idx = self.index(id, axis=axis)
1398
1399        return md[idx] if md is not None else None
1400
1401    def index(self, id, axis):
1402        """Return the index of the identified sample/observation.
1403
1404        Parameters
1405        ----------
1406        id : str
1407            ID of the sample or observation whose index will be returned.
1408        axis : {'sample', 'observation'}
1409            Axis to search for `id`.
1410
1411        Returns
1412        -------
1413        int
1414            Index of the sample/observation identified by `id`.
1415
1416        Raises
1417        ------
1418        UnknownAxisError
1419            If provided an unrecognized axis.
1420        UnknownIDError
1421            If provided an unrecognized sample/observation ID.
1422
1423        Examples
1424        --------
1425        >>> import numpy as np
1426        >>> from biom.table import Table
1427
1428        Create a 2x3 BIOM table:
1429
1430        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1431        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
1432
1433        Get the index of the observation with ID "O2":
1434
1435        >>> table.index('O2', 'observation')
1436        1
1437
1438        Get the index of the sample with ID "S1":
1439
1440        >>> table.index('S1', 'sample')
1441        0
1442        """
1443        idx_lookup = self._index(axis=axis)
1444
1445        if id not in idx_lookup:
1446            raise UnknownIDError(id, axis)
1447
1448        return idx_lookup[id]
1449
1450    def get_value_by_ids(self, obs_id, samp_id):
1451        """Return value in the matrix corresponding to ``(obs_id, samp_id)``
1452
1453        Parameters
1454        ----------
1455        obs_id : str
1456            The ID of the observation
1457        samp_id : str
1458            The ID of the sample
1459
1460        Returns
1461        -------
1462        float
1463            The data value corresponding to the specified matrix position
1464
1465        Examples
1466        --------
1467        >>> import numpy as np
1468        >>> from biom.table import Table
1469
1470        Create a 2x3 BIOM table:
1471
1472        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1473        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'Z3'])
1474
1475        Retrieve the number of counts for observation `O1` in sample `Z3`.
1476
1477        >>> print(table.get_value_by_ids('O2', 'Z3'))
1478        42.0
1479
1480        See Also
1481        --------
1482        Table.data
1483        """
1484        return self[self.index(obs_id, 'observation'),
1485                    self.index(samp_id, 'sample')]
1486
1487    def __str__(self):
1488        """Stringify self
1489
1490        Default str output for a Table is just row/col ids and data values
1491        """
1492        return self.delimited_self()
1493
1494    def __repr__(self):
1495        """Returns a high-level summary of the table's properties
1496
1497        Returns
1498        -------
1499        str
1500            A string detailing the shape, class, number of nonzero entries, and
1501            table density
1502        """
1503        rows, cols = self.shape
1504        return '%d x %d %s with %d nonzero entries (%d%% dense)' % (
1505            rows, cols, repr(self.__class__), self.nnz,
1506            self.get_table_density() * 100
1507        )
1508
1509    def exists(self, id, axis="sample"):
1510        """Returns whether id exists in axis
1511
1512        Parameters
1513        ----------
1514        id: str
1515            id to check if exists
1516        axis : {'sample', 'observation'}, optional
1517            The axis to check
1518
1519        Returns
1520        -------
1521        bool
1522            ``True`` if `id` exists, ``False`` otherwise
1523
1524        Examples
1525        --------
1526        >>> import numpy as np
1527        >>> from biom.table import Table
1528
1529        Create a 2x3 BIOM table:
1530
1531        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1532        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
1533
1534        Check whether sample ID is in the table:
1535
1536        >>> table.exists('S1')
1537        True
1538        >>> table.exists('S4')
1539        False
1540
1541        Check whether an observation ID is in the table:
1542
1543        >>> table.exists('O1', 'observation')
1544        True
1545        >>> table.exists('O3', 'observation')
1546        False
1547        """
1548        return id in self._index(axis=axis)
1549
1550    def delimited_self(self, delim=u'\t', header_key=None, header_value=None,
1551                       metadata_formatter=str,
1552                       observation_column_name=u'#OTU ID', direct_io=None):
1553        """Return self as a string in a delimited form
1554
1555        Default str output for the Table is just row/col ids and table data
1556        without any metadata
1557
1558        Including observation metadata in output: If ``header_key`` is not
1559        ``None``, the observation metadata with that name will be included
1560        in the delimited output. If ``header_value`` is also not ``None``, the
1561        observation metadata will use the provided ``header_value`` as the
1562        observation metadata name (i.e., the column header) in the delimited
1563        output.
1564
1565        ``metadata_formatter``: a function which takes a metadata entry and
1566        returns a formatted version that should be written to file
1567
1568        ``observation_column_name``: the name of the first column in the output
1569        table, corresponding to the observation IDs. For example, the default
1570        will look something like:
1571
1572            #OTU ID\tSample1\tSample2
1573            OTU1\t10\t2
1574            OTU2\t4\t8
1575        """
1576        def to_utf8(i):
1577            if isinstance(i, bytes):
1578                return i.decode('utf8')
1579            else:
1580                return str(i)
1581
1582        if self.is_empty():
1583            raise TableException("Cannot delimit self if I don't have data...")
1584
1585        samp_ids = delim.join([to_utf8(i) for i in self.ids()])
1586        # 17 hrs of straight programming later...
1587        if header_key is not None:
1588            if header_value is None:
1589                raise TableException(
1590                    "You need to specify both header_key and header_value")
1591
1592        if header_value is not None:
1593            if header_key is None:
1594                raise TableException(
1595                    "You need to specify both header_key and header_value")
1596
1597        if header_value:
1598            output = [u'# Constructed from biom file',
1599                      u'%s%s%s\t%s' % (observation_column_name, delim,
1600                                       samp_ids, header_value)]
1601        else:
1602            output = ['# Constructed from biom file',
1603                      '%s%s%s' % (observation_column_name, delim, samp_ids)]
1604
1605        if direct_io is not None:
1606            direct_io.writelines([i+"\n" for i in output])
1607
1608        obs_metadata = self.metadata(axis='observation')
1609        iterable = self.ids(axis='observation')
1610        end_line = '' if direct_io is None else '\n'
1611
1612        for obs_id, obs_values in zip(iterable,
1613                                      self._iter_obs()):
1614            str_obs_vals = delim.join(map(str, self._to_dense(obs_values)))
1615            obs_id = to_utf8(obs_id)
1616            if header_key and obs_metadata is not None:
1617                md = obs_metadata[self._obs_index[obs_id]]
1618                md_out = metadata_formatter(md.get(header_key, None))
1619                output_row = u'%s%s%s\t%s%s' % \
1620                    (obs_id, delim, str_obs_vals, md_out, end_line)
1621
1622                if direct_io is None:
1623                    output.append(output_row)
1624                else:
1625                    direct_io.write(output_row)
1626            else:
1627                output_row = u'%s%s%s%s' % \
1628                            (obs_id, delim, str_obs_vals, end_line)
1629                if direct_io is None:
1630                    output.append(output_row)
1631                else:
1632                    direct_io.write((output_row))
1633
1634        return '\n'.join(output)
1635
1636    def is_empty(self):
1637        """Check whether the table is empty
1638
1639        Returns
1640        -------
1641        bool
1642            ``True`` if the table is empty, ``False`` otherwise
1643        """
1644        if not self.ids().size or not self.ids(axis='observation').size:
1645            return True
1646        else:
1647            return False
1648
1649    def __iter__(self):
1650        """See ``biom.table.Table.iter``"""
1651        return self.iter()
1652
1653    def _iter_samp(self):
1654        """Return sample vectors of data matrix vectors"""
1655        for c in range(self.shape[1]):
1656            # this pulls out col vectors but need to convert to the expected
1657            # row vector
1658            colvec = self._get_col(c)
1659            yield colvec.transpose(copy=True)
1660
1661    def _iter_obs(self):
1662        """Return observation vectors of data matrix"""
1663        for r in range(self.shape[0]):
1664            yield self._get_row(r)
1665
1666    def get_table_density(self):
1667        """Returns the fraction of nonzero elements in the table.
1668
1669        Returns
1670        -------
1671        float
1672            The fraction of nonzero elements in the table
1673        """
1674        density = 0.0
1675
1676        if not self.is_empty():
1677            density = (self.nnz /
1678                       (len(self.ids()) * len(self.ids(axis='observation'))))
1679
1680        return density
1681
1682    def descriptive_equality(self, other):
1683        """For use in testing, describe how the tables are not equal"""
1684        if not isinstance(other, self.__class__):
1685            return "Tables are not of comparable classes"
1686        if not self.type == other.type:
1687            return "Tables are not the same type"
1688        if not np.array_equal(self.ids(axis='observation'),
1689                              other.ids(axis='observation')):
1690            return "Observation IDs are not the same"
1691        if not np.array_equal(self.ids(), other.ids()):
1692            return "Sample IDs are not the same"
1693        if not np.array_equal(self.metadata(axis='observation'),
1694                              other.metadata(axis='observation')):
1695            return "Observation metadata are not the same"
1696        if not np.array_equal(self.metadata(), other.metadata()):
1697            return "Sample metadata are not the same"
1698        if not self._data_equality(other._data):
1699            return "Data elements are not the same"
1700
1701        return "Tables appear equal"
1702
1703    def __eq__(self, other):
1704        """Equality is determined by the data matrix, metadata, and IDs"""
1705        if not isinstance(other, self.__class__):
1706            return False
1707        if self.type != other.type:
1708            return False
1709        if not np.array_equal(self.ids(axis='observation'),
1710                              other.ids(axis='observation')):
1711            return False
1712        if not np.array_equal(self.ids(), other.ids()):
1713            return False
1714        if not np.array_equal(self.metadata(axis='observation'),
1715                              other.metadata(axis='observation')):
1716            return False
1717        if not np.array_equal(self.metadata(), other.metadata()):
1718            return False
1719        if not self._data_equality(other._data):
1720            return False
1721
1722        return True
1723
1724    def _data_equality(self, other):
1725        """Return ``True`` if both matrices are equal.
1726
1727        Matrices are equal iff the following items are equal:
1728        - shape
1729        - dtype
1730        - size (nnz)
1731        - matrix data (more expensive, so checked last)
1732
1733        The sparse format does not need to be the same between the two
1734        matrices. ``self`` and ``other`` will be converted to csr format if
1735        necessary before performing the final comparison.
1736
1737        """
1738        if self._data.shape != other.shape:
1739            return False
1740
1741        if self._data.dtype != other.dtype:
1742            return False
1743
1744        if self._data.nnz != other.nnz:
1745            return False
1746
1747        self._data = self._data.tocsr()
1748        other = other.tocsr()
1749
1750        if (self._data != other).nnz > 0:
1751            return False
1752
1753        return True
1754
1755    def __ne__(self, other):
1756        return not (self == other)
1757
1758    def data(self, id, axis='sample', dense=True):
1759        """Returns data associated with an `id`
1760
1761        Parameters
1762        ----------
1763        id : str
1764            ID of the sample or observation whose data will be returned.
1765        axis : {'sample', 'observation'}
1766            Axis to search for `id`.
1767        dense : bool, optional
1768            If ``True``, return data as dense
1769
1770        Returns
1771        -------
1772        np.ndarray or scipy.sparse.spmatrix
1773            np.ndarray if ``dense``, otherwise scipy.sparse.spmatrix
1774
1775        Raises
1776        ------
1777        UnknownAxisError
1778            If provided an unrecognized axis.
1779
1780        Examples
1781        --------
1782        >>> from biom import example_table
1783        >>> example_table.data('S1', axis='sample')
1784        array([ 0.,  3.])
1785
1786        See Also
1787        --------
1788        Table.get_value_by_ids
1789
1790        """
1791        if axis == 'sample':
1792            data = self[:, self.index(id, 'sample')]
1793        elif axis == 'observation':
1794            data = self[self.index(id, 'observation'), :]
1795        else:
1796            raise UnknownAxisError(axis)
1797
1798        if dense:
1799            return self._to_dense(data)
1800        else:
1801            return data
1802
1803    def copy(self):
1804        """Returns a copy of the table"""
1805        return self.__class__(self._data.copy(),
1806                              self.ids(axis='observation').copy(),
1807                              self.ids().copy(),
1808                              deepcopy(self.metadata(axis='observation')),
1809                              deepcopy(self.metadata()),
1810                              self.table_id,
1811                              type=self.type)
1812
1813    def iter_data(self, dense=True, axis='sample'):
1814        """Yields axis values
1815
1816        Parameters
1817        ----------
1818        dense : bool, optional
1819            Defaults to ``True``. If ``False``, yield compressed sparse row or
1820            compressed sparse columns if `axis` is 'observation' or 'sample',
1821            respectively.
1822        axis : {'sample', 'observation'}, optional
1823            Axis to iterate over.
1824
1825        Returns
1826        -------
1827        generator
1828            Yields list of values for each value in `axis`
1829
1830        Raises
1831        ------
1832        UnknownAxisError
1833            If axis other than 'sample' or 'observation' passed
1834
1835        Examples
1836        --------
1837        >>> import numpy as np
1838        >>> from biom.table import Table
1839        >>> data = np.arange(30).reshape(3,10) # 3 X 10 OTU X Sample table
1840        >>> obs_ids = ['o1', 'o2', 'o3']
1841        >>> sam_ids = ['s%i' %i for i in range(1,11)]
1842        >>> bt = Table(data, observation_ids=obs_ids, sample_ids=sam_ids)
1843
1844        Lets find the sample with the largest sum
1845
1846        >>> sample_gen = bt.iter_data(axis='sample')
1847        >>> max_sample_count = max([sample.sum() for sample in sample_gen])
1848        >>> print(max_sample_count)
1849        57.0
1850        """
1851        if axis == "sample":
1852            for samp_v in self._iter_samp():
1853                if dense:
1854                    yield self._to_dense(samp_v)
1855                else:
1856                    yield samp_v
1857        elif axis == "observation":
1858            for obs_v in self._iter_obs():
1859                if dense:
1860                    yield self._to_dense(obs_v)
1861                else:
1862                    yield obs_v
1863        else:
1864            raise UnknownAxisError(axis)
1865
1866    def iter(self, dense=True, axis='sample'):
1867        """Yields ``(value, id, metadata)``
1868
1869
1870        Parameters
1871        ----------
1872        dense : bool, optional
1873            Defaults to ``True``. If ``False``, yield compressed sparse row or
1874            compressed sparse columns if `axis` is 'observation' or 'sample',
1875            respectively.
1876        axis : {'sample', 'observation'}, optional
1877            The axis to iterate over.
1878
1879        Returns
1880        -------
1881        GeneratorType
1882            A generator that yields (values, id, metadata)
1883
1884        Examples
1885        --------
1886        >>> import numpy as np
1887        >>> from biom.table import Table
1888
1889        Create a 2x3 BIOM table:
1890
1891        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
1892        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'Z3'])
1893
1894        Iter over samples and keep those that start with an Z:
1895
1896        >>> [(values, id, metadata)
1897        ...     for values, id, metadata in table.iter() if id[0]=='Z']
1898        [(array([  1.,  42.]), 'Z3', None)]
1899
1900        Iter over observations and add the 2nd column of the values
1901
1902        >>> col = [values[1] for values, id, metadata in table.iter()]
1903        >>> sum(col)
1904        46.0
1905        """
1906        ids = self.ids(axis=axis)
1907        metadata = self.metadata(axis=axis)
1908        if axis == 'sample':
1909            iter_ = self._iter_samp()
1910        elif axis == 'observation':
1911            iter_ = self._iter_obs()
1912        else:
1913            raise UnknownAxisError(axis)
1914
1915        if metadata is None:
1916            metadata = (None,) * len(ids)
1917
1918        iter_ = self.iter_data(axis=axis, dense=dense)
1919
1920        return zip(iter_, ids, metadata)
1921
1922    def iter_pairwise(self, dense=True, axis='sample', tri=True, diag=False):
1923        """Pairwise iteration over self
1924
1925        Parameters
1926        ----------
1927        dense : bool, optional
1928            Defaults to ``True``. If ``False``, yield compressed sparse row or
1929            compressed sparse columns if `axis` is 'observation' or 'sample',
1930            respectively.
1931        axis : {'sample', 'observation'}, optional
1932            The axis to iterate over.
1933        tri : bool, optional
1934            If ``True``, just yield [i, j] and not [j, i]
1935        diag : bool, optional
1936            If ``True``, yield [i, i]
1937
1938        Returns
1939        -------
1940        GeneratorType
1941            Yields [(val_i, id_i, metadata_i), (val_j, id_j, metadata_j)]
1942
1943        Raises
1944        ------
1945        UnknownAxisError
1946
1947        Examples
1948        --------
1949        >>> from biom import example_table
1950
1951        By default, only the upper triangle without the diagonal  of the
1952        resulting pairwise combinations is yielded.
1953
1954        >>> iter_ = example_table.iter_pairwise()
1955        >>> for (val_i, id_i, md_i), (val_j, id_j, md_j) in iter_:
1956        ...     print(id_i, id_j)
1957        S1 S2
1958        S1 S3
1959        S2 S3
1960
1961        The full pairwise combinations can also be yielded though.
1962
1963        >>> iter_ = example_table.iter_pairwise(tri=False, diag=True)
1964        >>> for (val_i, id_i, md_i), (val_j, id_j, md_j) in iter_:
1965        ...     print(id_i, id_j)
1966        S1 S1
1967        S1 S2
1968        S1 S3
1969        S2 S1
1970        S2 S2
1971        S2 S3
1972        S3 S1
1973        S3 S2
1974        S3 S3
1975
1976        """
1977        metadata = self.metadata(axis=axis)
1978        ids = self.ids(axis=axis)
1979
1980        if metadata is None:
1981            metadata = (None,) * len(ids)
1982
1983        ind = np.arange(len(ids))
1984        diag_v = 1 - diag  # for offseting tri_f, where a 0 includes the diag
1985
1986        if tri:
1987            def tri_f(idx):
1988                return ind[idx+diag_v:]
1989        else:
1990            def tri_f(idx):
1991                return np.hstack([ind[:idx], ind[idx+diag_v:]])
1992
1993        for idx, i in enumerate(ind):
1994            id_i = ids[i]
1995            md_i = metadata[i]
1996            data_i = self.data(id_i, axis=axis, dense=dense)
1997
1998            for j in tri_f(idx):
1999                id_j = ids[j]
2000                md_j = metadata[j]
2001                data_j = self.data(id_j, axis=axis, dense=dense)
2002
2003                yield ((data_i, id_i, md_i), (data_j, id_j, md_j))
2004
2005    def sort_order(self, order, axis='sample'):
2006        """Return a new table with `axis` in `order`
2007
2008        Parameters
2009        ----------
2010        order : iterable
2011            The desired order for axis
2012        axis : {'sample', 'observation'}, optional
2013            The axis to operate on
2014
2015        Returns
2016        -------
2017        Table
2018            A table where the observations or samples are sorted according to
2019            `order`
2020
2021        Examples
2022        --------
2023
2024        >>> import numpy as np
2025        >>> from biom.table import Table
2026
2027        Create a 2x3 BIOM table:
2028
2029        >>> data = np.asarray([[1, 0, 4], [1, 3, 0]])
2030        >>> table = Table(data, ['O2', 'O1'], ['S2', 'S1', 'S3'])
2031        >>> print(table) # doctest: +NORMALIZE_WHITESPACE
2032        # Constructed from biom file
2033        #OTU ID S2  S1  S3
2034        O2  1.0 0.0 4.0
2035        O1  1.0 3.0 0.0
2036
2037        Sort the table using a list of samples:
2038
2039        >>> sorted_table = table.sort_order(['S2', 'S3', 'S1'])
2040        >>> print(sorted_table) # doctest: +NORMALIZE_WHITESPACE
2041        # Constructed from biom file
2042        #OTU ID	S2	S3	S1
2043        O2	1.0	4.0	0.0
2044        O1	1.0	0.0	3.0
2045
2046
2047        Additionally you could sort the table's observations:
2048
2049        >>> sorted_table = table.sort_order(['O1', 'O2'], axis="observation")
2050        >>> print(sorted_table) # doctest: +NORMALIZE_WHITESPACE
2051        # Constructed from biom file
2052        #OTU ID	S2	S1	S3
2053        O1	1.0	3.0	0.0
2054        O2	1.0	0.0	4.0
2055
2056        """
2057        fancy = np.array([self.index(i, axis=axis) for i in order], dtype=int)
2058        metadata = self.metadata(axis=axis)
2059        if metadata is not None:
2060            metadata = np.array(metadata)[fancy]
2061
2062        if axis == 'sample':
2063            mat = self.matrix_data[:, fancy]
2064            return self.__class__(mat,
2065                                  self.ids(axis='observation')[:], order[:],
2066                                  self.metadata(axis='observation'), metadata,
2067                                  self.table_id, self.type)
2068
2069        elif axis == 'observation':
2070            mat = self.matrix_data[fancy, :]
2071            return self.__class__(mat,
2072                                  order[:], self.ids()[:],
2073                                  metadata, self.metadata(), self.table_id,
2074                                  self.type)
2075        else:
2076            raise UnknownAxisError(axis)
2077
2078    def sort(self, sort_f=natsort, axis='sample'):
2079        """Return a table sorted along axis
2080
2081        Parameters
2082        ----------
2083        sort_f : function, optional
2084            Defaults to ``biom.util.natsort``. A function that takes a list of
2085            values and sorts it
2086        axis : {'sample', 'observation'}, optional
2087            The axis to operate on
2088
2089        Returns
2090        -------
2091        biom.Table
2092            A table whose samples or observations are sorted according to the
2093            `sort_f` function
2094
2095        Examples
2096        --------
2097        >>> import numpy as np
2098        >>> from biom.table import Table
2099
2100        Create a 2x3 BIOM table:
2101
2102        >>> data = np.asarray([[1, 0, 4], [1, 3, 0]])
2103        >>> table = Table(data, ['O2', 'O1'], ['S2', 'S1', 'S3'])
2104        >>> print(table) # doctest: +NORMALIZE_WHITESPACE
2105        # Constructed from biom file
2106        #OTU ID S2  S1  S3
2107        O2  1.0 0.0 4.0
2108        O1  1.0 3.0 0.0
2109
2110        Sort the order of samples in the table using the default natural
2111        sorting:
2112
2113        >>> new_table = table.sort()
2114        >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE
2115        # Constructed from biom file
2116        #OTU ID S1  S2  S3
2117        O2  0.0 1.0 4.0
2118        O1  3.0 1.0 0.0
2119
2120        Sort the order of observations in the table using the default natural
2121        sorting:
2122
2123        >>> new_table = table.sort(axis='observation')
2124        >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE
2125        # Constructed from biom file
2126        #OTU ID S2  S1  S3
2127        O1  1.0 3.0 0.0
2128        O2  1.0 0.0 4.0
2129
2130        Sort the samples in reverse order using a custom sort function:
2131
2132        >>> sort_f = lambda x: list(sorted(x, reverse=True))
2133        >>> new_table = table.sort(sort_f=sort_f)
2134        >>> print(new_table)  # doctest: +NORMALIZE_WHITESPACE
2135        # Constructed from biom file
2136        #OTU ID S3  S2  S1
2137        O2  4.0 1.0 0.0
2138        O1  0.0 1.0 3.0
2139        """
2140        return self.sort_order(sort_f(self.ids(axis=axis)), axis=axis)
2141
2142    def filter(self, ids_to_keep, axis='sample', invert=False, inplace=True):
2143        """Filter a table based on a function or iterable.
2144
2145        Parameters
2146        ----------
2147        ids_to_keep : iterable, or function(values, id, metadata) -> bool
2148            If a function, it will be called with the values of the
2149            sample/observation, its id (a string) and the dictionary
2150            of metadata of each sample/observation, and must return a
2151            boolean. If it's an iterable, it must be a list of ids to
2152            keep.
2153        axis : {'sample', 'observation'}, optional
2154            It controls whether to filter samples or observations and
2155            defaults to "sample".
2156        invert : bool, optional
2157            Defaults to ``False``. If set to ``True``, discard samples or
2158            observations where `ids_to_keep` returns True
2159        inplace : bool, optional
2160            Defaults to ``True``. Whether to return a new table or modify
2161            itself.
2162
2163        Returns
2164        -------
2165        biom.Table
2166            Returns itself if `inplace`, else returns a new filtered table.
2167
2168        Raises
2169        ------
2170        UnknownAxisError
2171            If provided an unrecognized axis.
2172
2173        Examples
2174        --------
2175        >>> import numpy as np
2176        >>> from biom.table import Table
2177
2178        Create a 2x3 BIOM table, with observation metadata and sample
2179        metadata:
2180
2181        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
2182        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'],
2183        ...               [{'full_genome_available': True},
2184        ...                {'full_genome_available': False}],
2185        ...               [{'sample_type': 'a'}, {'sample_type': 'a'},
2186        ...                {'sample_type': 'b'}])
2187
2188        Define a function to keep only samples with sample_type == 'a'. This
2189        will drop sample S3, which has sample_type 'b':
2190
2191        >>> filter_fn = lambda val, id_, md: md['sample_type'] == 'a'
2192
2193        Get a filtered version of the table, leaving the original table
2194        untouched:
2195
2196        >>> new_table = table.filter(filter_fn, inplace=False)
2197        >>> print(table.ids())
2198        ['S1' 'S2' 'S3']
2199        >>> print(new_table.ids())
2200        ['S1' 'S2']
2201
2202        Using the same filtering function, discard all samples with sample_type
2203        'a'. This will keep only sample S3, which has sample_type 'b':
2204
2205        >>> new_table = table.filter(filter_fn, inplace=False, invert=True)
2206        >>> print(table.ids())
2207        ['S1' 'S2' 'S3']
2208        >>> print(new_table.ids())
2209        ['S3']
2210
2211        Filter the table in-place using the same function (drop all samples
2212        where sample_type is not 'a'):
2213
2214        >>> table.filter(filter_fn)
2215        2 x 2 <class 'biom.table.Table'> with 2 nonzero entries (50% dense)
2216        >>> print(table.ids())
2217        ['S1' 'S2']
2218
2219        Filter out all observations in the table that do not have
2220        full_genome_available == True. This will filter out observation O2:
2221
2222        >>> filter_fn = lambda val, id_, md: md['full_genome_available']
2223        >>> table.filter(filter_fn, axis='observation')
2224        1 x 2 <class 'biom.table.Table'> with 0 nonzero entries (0% dense)
2225        >>> print(table.ids(axis='observation'))
2226        ['O1']
2227
2228        """
2229        table = self if inplace else self.copy()
2230
2231        metadata = table.metadata(axis=axis)
2232        ids = table.ids(axis=axis)
2233        index = self._index(axis=axis)
2234        axis = table._axis_to_num(axis=axis)
2235
2236        arr = table._data
2237        arr, ids, metadata = _filter(arr,
2238                                     ids,
2239                                     metadata,
2240                                     index,
2241                                     ids_to_keep,
2242                                     axis,
2243                                     invert=invert)
2244
2245        table._data = arr
2246        if axis == 1:
2247            table._sample_ids = ids
2248            table._sample_metadata = metadata
2249        elif axis == 0:
2250            table._observation_ids = ids
2251            table._observation_metadata = metadata
2252
2253        table._index_ids()
2254        errcheck(table)
2255
2256        return table
2257
2258    def partition(self, f, axis='sample'):
2259        """Yields partitions
2260
2261        Parameters
2262        ----------
2263        f : function
2264            `f` is given the ID and metadata of the vector and must return
2265            what partition the vector is part of.
2266        axis : {'sample', 'observation'}, optional
2267            The axis to iterate over
2268
2269        Returns
2270        -------
2271        GeneratorType
2272            A generator that yields (partition, `Table`)
2273
2274        Examples
2275        --------
2276        >>> import numpy as np
2277        >>> from biom.table import Table
2278        >>> from biom.util import unzip
2279
2280        Create a 2x3 BIOM table, with observation metadata and sample
2281        metadata:
2282
2283        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
2284        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'],
2285        ...               [{'full_genome_available': True},
2286        ...                {'full_genome_available': False}],
2287        ...               [{'sample_type': 'a'}, {'sample_type': 'a'},
2288        ...                {'sample_type': 'b'}])
2289
2290        Define a function to bin by sample_type
2291
2292        >>> f = lambda id_, md: md['sample_type']
2293
2294        Partition the table and view results
2295
2296        >>> bins, tables = table.partition(f)
2297        >>> print(bins[1]) # doctest: +NORMALIZE_WHITESPACE
2298        # Constructed from biom file
2299        #OTU ID S1  S2
2300        O1  0.0 0.0
2301        O2  1.0 3.0
2302        >>> print(tables[1]) # doctest: +NORMALIZE_WHITESPACE
2303        # Constructed from biom file
2304        #OTU ID S3
2305        O1  1.0
2306        O2  42.0
2307        """
2308        partitions = {}
2309        # conversion of vector types is not necessary, vectors are not
2310        # being passed to an arbitrary function
2311        for vals, id_, md in self.iter(dense=False, axis=axis):
2312            part = f(id_, md)
2313
2314            # try to make it hashable...
2315            if not isinstance(part, Hashable):
2316                part = tuple(part)
2317
2318            if part not in partitions:
2319                partitions[part] = [[], [], []]
2320
2321            partitions[part][0].append(id_)
2322            partitions[part][1].append(vals)
2323            partitions[part][2].append(md)
2324
2325        md = self.metadata(axis=self._invert_axis(axis))
2326
2327        for part, (ids, values, metadata) in viewitems(partitions):
2328            if axis == 'sample':
2329                data = self._conv_to_self_type(values, transpose=True)
2330                samp_ids = ids
2331                samp_md = metadata
2332                obs_ids = self.ids(axis='observation')[:]
2333                obs_md = md[:] if md is not None else None
2334
2335            elif axis == 'observation':
2336                data = self._conv_to_self_type(values, transpose=False)
2337                obs_ids = ids
2338                obs_md = metadata
2339                samp_ids = self.ids()[:]
2340                samp_md = md[:] if md is not None else None
2341
2342            yield part, Table(data, obs_ids, samp_ids, obs_md, samp_md,
2343                              self.table_id, type=self.type)
2344
2345    def collapse(self, f, collapse_f=None, norm=True, min_group_size=1,
2346                 include_collapsed_metadata=True, one_to_many=False,
2347                 one_to_many_mode='add', one_to_many_md_key='Path',
2348                 strict=False, axis='sample'):
2349        """Collapse partitions in a table by metadata or by IDs
2350
2351        Partition data by metadata or IDs and then collapse each partition into
2352        a single vector.
2353
2354        If `include_collapsed_metadata` is ``True``, the metadata for the
2355        collapsed partition will be a category named 'collapsed_ids', in which
2356        a list of the original ids that made up the partition is retained
2357
2358        The remainder is only relevant to setting `one_to_many` to ``True``.
2359
2360        If `one_to_many` is ``True``, allow vectors to collapse into multiple
2361        bins if the metadata describe a one-many relationship. Supplied
2362        functions must allow for iteration support over the metadata key and
2363        must return a tuple of (path, bin) as to describe both the path in the
2364        hierarchy represented and the specific bin being collapsed into. The
2365        uniqueness of the bin is _not_ based on the path but by the name of the
2366        bin.
2367
2368        The metadata value for the corresponding collapsed column may include
2369        more (or less) information about the collapsed data. For example, if
2370        collapsing "FOO", and there are vectors that span three associations A,
2371        B, and C, such that vector 1 spans A and B, vector 2 spans B and C and
2372        vector 3 spans A and C, the resulting table will contain three
2373        collapsed vectors:
2374
2375        - A, containing original vectors 1 and 3
2376        - B, containing original vectors 1 and 2
2377        - C, containing original vectors 2 and 3
2378
2379        If a vector maps to the same partition multiple times, it will be
2380        counted multiple times.
2381
2382        There are two supported modes for handling one-to-many relationships
2383        via `one_to_many_mode`: ``add`` and `divide`. ``add`` will add the
2384        vector counts to each partition that the vector maps to, which may
2385        increase the total number of counts in the output table. ``divide``
2386        will divide a vectors's counts by the number of metadata that the
2387        vector has before adding the counts to each partition. This will not
2388        increase the total number of counts in the output table.
2389
2390        If `one_to_many_md_key` is specified, that becomes the metadata
2391        key that describes the collapsed path. If a value is not specified,
2392        then it defaults to 'Path'.
2393
2394        If `strict` is specified, then all metadata pathways operated on
2395        must be indexable by `metadata_f`.
2396
2397        `one_to_many` and `norm` are not supported together.
2398
2399        `one_to_many` and `collapse_f` are not supported together.
2400
2401        `one_to_many` and `min_group_size` are not supported together.
2402
2403        A final note on space consumption. At present, the `one_to_many`
2404        functionality requires a temporary dense matrix representation.
2405
2406        Parameters
2407        ----------
2408        f : function
2409            Function that is used to determine what partition a vector belongs
2410            to
2411        collapse_f : function, optional
2412            Function that collapses a partition in a one-to-one collapse. The
2413            expected function signature is:
2414
2415                dense or sparse_vector <- collapse_f(Table, axis)
2416
2417            Defaults to a pairwise add.
2418
2419        norm : bool, optional
2420            Defaults to ``True``. If ``True``, normalize the resulting table
2421        min_group_size : int, optional
2422            Defaults to ``1``. The minimum size of a partition when performing
2423            a one-to-one collapse
2424        include_collapsed_metadata : bool, optional
2425            Defaults to ``True``. If ``True``, retain the collapsed metadata
2426            keyed by the original IDs of the associated vectors
2427        one_to_many : bool, optional
2428            Defaults to ``False``. Perform a one-to-many collapse
2429        one_to_many_mode : {'add', 'divide'}, optional
2430            The way to reduce two vectors in a one-to-many collapse
2431        one_to_many_md_key : str, optional
2432            Defaults to "Path". If `include_collapsed_metadata` is ``True``,
2433            store the original vector metadata under this key
2434        strict : bool, optional
2435            Defaults to ``False``. Requires full pathway data within a
2436            one-to-many structure
2437        axis : {'sample', 'observation'}, optional
2438            The axis to collapse
2439
2440        Returns
2441        -------
2442        Table
2443            The collapsed table
2444
2445        Examples
2446        --------
2447        >>> import numpy as np
2448        >>> from biom.table import Table
2449
2450        Create a ``Table``
2451
2452        >>> dt_rich = Table(
2453        ...    np.array([[5, 6, 7], [8, 9, 10], [11, 12, 13]]),
2454        ...    ['1', '2', '3'], ['a', 'b', 'c'],
2455        ...    [{'taxonomy': ['k__a', 'p__b']},
2456        ...     {'taxonomy': ['k__a', 'p__c']},
2457        ...     {'taxonomy': ['k__a', 'p__c']}],
2458        ...    [{'barcode': 'aatt'},
2459        ...     {'barcode': 'ttgg'},
2460        ...     {'barcode': 'aatt'}])
2461        >>> print(dt_rich) # doctest: +NORMALIZE_WHITESPACE
2462        # Constructed from biom file
2463        #OTU ID a   b   c
2464        1   5.0 6.0 7.0
2465        2   8.0 9.0 10.0
2466        3   11.0    12.0    13.0
2467
2468        Create Function to determine what partition a vector belongs to
2469
2470        >>> bin_f = lambda id_, x: x['taxonomy'][1]
2471        >>> obs_phy = dt_rich.collapse(
2472        ...    bin_f, norm=False, min_group_size=1,
2473        ...    axis='observation').sort(axis='observation')
2474        >>> print(obs_phy) # doctest: +NORMALIZE_WHITESPACE
2475        # Constructed from biom file
2476        #OTU ID a   b   c
2477        p__b    5.0 6.0 7.0
2478        p__c    19.0    21.0    23.0
2479        """
2480        collapsed_data = []
2481        collapsed_ids = []
2482
2483        if include_collapsed_metadata:
2484            collapsed_md = []
2485        else:
2486            collapsed_md = None
2487
2488        if one_to_many_mode not in ['add', 'divide']:
2489            raise ValueError("Unrecognized one-to-many mode '%s'. Must be "
2490                             "either 'add' or 'divide'." % one_to_many_mode)
2491
2492        # transpose is only necessary in the one-to-one case
2493        # new_data_shape is only necessary in the one-to-many case
2494        # axis_slice is only necessary in the one-to-many case
2495        def axis_ids_md(t):
2496            return (t.ids(axis=axis), t.metadata(axis=axis))
2497
2498        if axis == 'sample':
2499            transpose = True
2500
2501            def new_data_shape(ids, collapsed):
2502                return (len(ids), len(collapsed))
2503
2504            def axis_slice(lookup, key):
2505                return (slice(None), lookup[key])
2506
2507        elif axis == 'observation':
2508            transpose = False
2509
2510            def new_data_shape(ids, collapsed):
2511                return (len(collapsed), len(ids))
2512
2513            def axis_slice(lookup, key):
2514                return (lookup[key], slice(None))
2515
2516        else:
2517            raise UnknownAxisError(axis)
2518
2519        if one_to_many:
2520            if norm:
2521                raise AttributeError(
2522                    "norm and one_to_many are not supported together")
2523
2524            # determine the collapsed pathway
2525            # we drop all other associated metadata
2526            new_md = {}
2527            md_count = {}
2528
2529            for id_, md in zip(*axis_ids_md(self)):
2530                md_iter = f(id_, md)
2531                num_md = 0
2532                while True:
2533                    try:
2534                        pathway, partition = next(md_iter)
2535                    except IndexError:
2536                        # if a pathway is incomplete
2537                        if strict:
2538                            # bail if strict
2539                            err = "Incomplete pathway, ID: %s, metadata: %s" %\
2540                                  (id_, md)
2541                            raise IndexError(err)
2542                        else:
2543                            # otherwise ignore
2544                            continue
2545                    except StopIteration:
2546                        break
2547
2548                    new_md[partition] = pathway
2549                    num_md += 1
2550
2551                md_count[id_] = num_md
2552
2553            idx_lookup = {part: i for i, part in enumerate(sorted(new_md))}
2554
2555            # We need to store floats, not ints, as things won't always divide
2556            # evenly.
2557            dtype = float if one_to_many_mode == 'divide' else self.dtype
2558
2559            new_data = zeros(new_data_shape(self.ids(self._invert_axis(axis)),
2560                                            new_md),
2561                             dtype=dtype)
2562
2563            # for each vector
2564            # for each bin in the metadata
2565            # for each partition associated with the vector
2566            for vals, id_, md in self.iter(axis=axis):
2567                md_iter = f(id_, md)
2568
2569                while True:
2570                    try:
2571                        pathway, part = next(md_iter)
2572                    except IndexError:
2573                        # if a pathway is incomplete
2574                        if strict:
2575                            # bail if strict, should never get here...
2576                            err = "Incomplete pathway, ID: %s, metadata: %s" %\
2577                                  (id_, md)
2578                            raise IndexError(err)
2579                        else:
2580                            # otherwise ignore
2581                            continue
2582                    except StopIteration:
2583                        break
2584                    if one_to_many_mode == 'add':
2585                        new_data[axis_slice(idx_lookup, part)] += vals
2586                    else:
2587                        new_data[axis_slice(idx_lookup, part)] += \
2588                            vals / md_count[id_]
2589
2590            if include_collapsed_metadata:
2591                # reassociate pathway information
2592                for k, i in sorted(viewitems(idx_lookup), key=itemgetter(1)):
2593                    collapsed_md.append({one_to_many_md_key: new_md[k]})
2594
2595            # get the new sample IDs
2596            collapsed_ids = [k for k, i in sorted(viewitems(idx_lookup),
2597                                                  key=itemgetter(1))]
2598
2599            # convert back to self type
2600            data = self._conv_to_self_type(new_data)
2601        else:
2602            if collapse_f is None:
2603                def collapse_f(t, axis):
2604                    return t.sum(axis)
2605
2606            for part, table in self.partition(f, axis=axis):
2607                axis_ids, axis_md = axis_ids_md(table)
2608
2609                if len(axis_ids) < min_group_size:
2610                    continue
2611
2612                redux_data = collapse_f(table, self._invert_axis(axis))
2613                if norm:
2614                    redux_data /= len(axis_ids)
2615
2616                collapsed_data.append(self._conv_to_self_type(redux_data))
2617                collapsed_ids.append(part)
2618
2619                if include_collapsed_metadata:
2620                    # retain metadata but store by original id
2621                    collapsed_md.append({'collapsed_ids': axis_ids.tolist()})
2622
2623            data = self._conv_to_self_type(collapsed_data, transpose=transpose)
2624
2625        # if the table is empty
2626        errcheck(self, 'empty')
2627
2628        md = self.metadata(axis=self._invert_axis(axis))
2629        if axis == 'sample':
2630            sample_ids = collapsed_ids
2631            sample_md = collapsed_md
2632            obs_ids = self.ids(axis='observation')[:]
2633            obs_md = md if md is not None else None
2634        else:
2635            sample_ids = self.ids()[:]
2636            obs_ids = collapsed_ids
2637            obs_md = collapsed_md
2638            sample_md = md if md is not None else None
2639
2640        return Table(data, obs_ids, sample_ids, obs_md, sample_md,
2641                     self.table_id, type=self.type)
2642
2643    def _invert_axis(self, axis):
2644        """Invert an axis"""
2645        if axis == 'sample':
2646            return 'observation'
2647        elif axis == 'observation':
2648            return 'sample'
2649        else:
2650            return UnknownAxisError(axis)
2651
2652    def _axis_to_num(self, axis):
2653        """Convert str axis to numerical axis"""
2654        if axis == 'sample':
2655            return 1
2656        elif axis == 'observation':
2657            return 0
2658        else:
2659            raise UnknownAxisError(axis)
2660
2661    def min(self, axis='sample'):
2662        """Get the minimum nonzero value over an axis
2663
2664        Parameters
2665        ----------
2666        axis : {'sample', 'observation', 'whole'}, optional
2667            Defaults to "sample". The axis over which to calculate minima.
2668
2669        Returns
2670        -------
2671        scalar of self.dtype or np.array of self.dtype
2672
2673        Raises
2674        ------
2675        UnknownAxisError
2676            If provided an unrecognized axis.
2677
2678        Examples
2679        --------
2680        >>> from biom import example_table
2681        >>> print(example_table.min(axis='sample'))
2682        [ 3.  1.  2.]
2683
2684        """
2685        if axis not in ['sample', 'observation', 'whole']:
2686            raise UnknownAxisError(axis)
2687
2688        if axis == 'whole':
2689            min_val = np.inf
2690            for data in self.iter_data(dense=False):
2691                # only min over the actual nonzero values
2692                min_val = min(min_val, data.data.min())
2693        else:
2694            min_val = zeros(len(self.ids(axis=axis)), dtype=self.dtype)
2695
2696            for idx, data in enumerate(self.iter_data(dense=False, axis=axis)):
2697                min_val[idx] = data.data.min()
2698
2699        return min_val
2700
2701    def max(self, axis='sample'):
2702        """Get the maximum nonzero value over an axis
2703
2704        Parameters
2705        ----------
2706        axis : {'sample', 'observation', 'whole'}, optional
2707            Defaults to "sample". The axis over which to calculate maxima.
2708
2709        Returns
2710        -------
2711        scalar of self.dtype or np.array of self.dtype
2712
2713        Raises
2714        ------
2715        UnknownAxisError
2716            If provided an unrecognized axis.
2717
2718        Examples
2719        --------
2720        >>> from biom import example_table
2721        >>> print(example_table.max(axis='observation'))
2722        [ 2.  5.]
2723
2724        """
2725        if axis not in ['sample', 'observation', 'whole']:
2726            raise UnknownAxisError(axis)
2727
2728        if axis == 'whole':
2729            max_val = -np.inf
2730            for data in self.iter_data(dense=False):
2731                # only min over the actual nonzero values
2732                max_val = max(max_val, data.data.max())
2733        else:
2734            max_val = np.empty(len(self.ids(axis=axis)), dtype=self.dtype)
2735
2736            for idx, data in enumerate(self.iter_data(dense=False, axis=axis)):
2737                max_val[idx] = data.data.max()
2738
2739        return max_val
2740
2741    def subsample(self, n, axis='sample', by_id=False, with_replacement=False):
2742        """Randomly subsample without replacement.
2743
2744        Parameters
2745        ----------
2746        n : int
2747            Number of items to subsample from `counts`.
2748        axis : {'sample', 'observation'}, optional
2749            The axis to sample over
2750        by_id : boolean, optional
2751            If `False`, the subsampling is based on the counts contained in the
2752            matrix (e.g., rarefaction). If `True`, the subsampling is based on
2753            the IDs (e.g., fetch a random subset of samples). Default is
2754            `False`.
2755        with_replacement : boolean, optional
2756            If `False` (default), subsample without replacement. If `True`,
2757            resample with replacement via the multinomial distribution.
2758            Should not be `True` if `by_id` is `True`.
2759
2760        Returns
2761        -------
2762        biom.Table
2763            A subsampled version of self
2764
2765        Raises
2766        ------
2767        ValueError
2768            - If `n` is less than zero.
2769            - If `by_id` and `with_replacement` are both True.
2770
2771        Notes
2772        -----
2773        Subsampling is performed without replacement. If `n` is greater than
2774        the sum of a given vector, that vector is omitted from the result.
2775
2776        Adapted from `skbio.math.subsample`, see biom-format/licenses for more
2777        information about scikit-bio.
2778
2779        This code assumes absolute abundance if `by_id` is False.
2780
2781        Examples
2782        --------
2783        >>> import numpy as np
2784        >>> from biom.table import Table
2785        >>> table = Table(np.array([[0, 2, 3], [1, 0, 2]]), ['O1', 'O2'],
2786        ...               ['S1', 'S2', 'S3'])
2787
2788        Subsample 1 item over the sample axis by value (e.g., rarefaction):
2789
2790        >>> print(table.subsample(1).sum(axis='sample'))
2791        [ 1.  1.  1.]
2792
2793        Subsample 2 items over the sample axis, note that 'S1' is filtered out:
2794
2795        >>> ss = table.subsample(2)
2796        >>> print(ss.sum(axis='sample'))
2797        [ 2.  2.]
2798        >>> print(ss.ids())
2799        ['S2' 'S3']
2800
2801        Subsample by IDs over the sample axis. For this example, we're going to
2802        randomly select 2 samples and do this 100 times, and then print out the
2803        set of IDs observed.
2804
2805        >>> ids = set([tuple(table.subsample(2, by_id=True).ids())
2806        ...            for i in range(100)])
2807        >>> print(sorted(ids))
2808        [('S1', 'S2'), ('S1', 'S3'), ('S2', 'S3')]
2809
2810        """
2811        if n < 0:
2812            raise ValueError("n cannot be negative.")
2813
2814        if with_replacement and by_id:
2815            raise ValueError("by_id and with_replacement cannot both be True")
2816
2817        table = self.copy()
2818
2819        if by_id:
2820            ids = table.ids(axis=axis).copy()
2821            np.random.shuffle(ids)
2822            subset = set(ids[:n])
2823            table.filter(lambda v, i, md: i in subset, axis=axis)
2824        else:
2825            data = table._get_sparse_data()
2826            _subsample(data, n, with_replacement)
2827            table._data = data
2828
2829            table.filter(lambda v, i, md: v.sum() > 0, axis=axis)
2830
2831        inv_axis = self._invert_axis(axis)
2832        table.filter(lambda v, i, md: v.sum() > 0, axis=inv_axis)
2833
2834        return table
2835
2836    def pa(self, inplace=True):
2837        """Convert the table to presence/absence data
2838
2839        Parameters
2840        ----------
2841        inplace : bool, optional
2842            Defaults to ``True``
2843
2844        Returns
2845        -------
2846        Table
2847            Returns itself if `inplace`, else returns a new presence/absence
2848            table.
2849
2850        Examples
2851        --------
2852        >>> from biom.table import Table
2853        >>> import numpy as np
2854
2855        Create a 2x3 BIOM table
2856
2857        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
2858        >>> table = table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
2859
2860        Convert to presence/absence data
2861
2862        >>> _ = table.pa()
2863        >>> print(table.data('O1', 'observation'))
2864        [ 0.  0.  1.]
2865        >>> print(table.data('O2', 'observation'))
2866        [ 1.  1.  1.]
2867        """
2868        def transform_f(data, id_, metadata):
2869            return np.where(data != 0, 1., 0.)
2870
2871        return self.transform(transform_f, inplace=inplace)
2872
2873    def transform(self, f, axis='sample', inplace=True):
2874        """Iterate over `axis`, applying a function `f` to each vector.
2875
2876        Only non null values can be modified and the density of the
2877        table can't increase. However, zeroing values is fine.
2878
2879        Parameters
2880        ----------
2881        f : function(data, id, metadata) -> new data
2882            A function that takes three values: an array of nonzero
2883            values corresponding to each observation or sample, an
2884            observation or sample id, and an observation or sample
2885            metadata entry. It must return an array of transformed
2886            values that replace the original values.
2887        axis : {'sample', 'observation'}, optional
2888            The axis to operate on. Can be "sample" or "observation".
2889        inplace : bool, optional
2890            Defaults to ``True``. Whether to return a new table or modify
2891            itself.
2892
2893        Returns
2894        -------
2895        biom.Table
2896            Returns itself if `inplace`, else returns a new transformed table.
2897
2898        Raises
2899        ------
2900        UnknownAxisError
2901            If provided an unrecognized axis.
2902
2903        Examples
2904        --------
2905        >>> import numpy as np
2906        >>> from biom.table import Table
2907
2908        Create a 2x3 table
2909
2910        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
2911        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'],
2912        ...               [{'foo': 'bar'}, {'x': 'y'}], None)
2913        >>> print(table) # doctest: +NORMALIZE_WHITESPACE
2914        # Constructed from biom file
2915        #OTU ID S1  S2  S3
2916        O1  0.0 0.0 1.0
2917        O2  1.0 3.0 42.0
2918
2919        Create a transform function
2920
2921        >>> f = lambda data, id_, md: data / 2
2922
2923        Transform to a new table on samples
2924
2925        >>> table2 = table.transform(f, 'sample', False)
2926        >>> print(table2) # doctest: +NORMALIZE_WHITESPACE
2927        # Constructed from biom file
2928        #OTU ID S1  S2  S3
2929        O1  0.0 0.0 0.5
2930        O2  0.5 1.5 21.0
2931
2932        `table` hasn't changed
2933
2934        >>> print(table) # doctest: +NORMALIZE_WHITESPACE
2935        # Constructed from biom file
2936        #OTU ID S1  S2  S3
2937        O1  0.0 0.0 1.0
2938        O2  1.0 3.0 42.0
2939
2940        Tranform in place on observations
2941
2942        >>> table3 = table.transform(f, 'observation', True)
2943
2944        `table` is different now
2945
2946        >>> print(table) # doctest: +NORMALIZE_WHITESPACE
2947        # Constructed from biom file
2948        #OTU ID S1  S2  S3
2949        O1  0.0 0.0 0.5
2950        O2  0.5 1.5 21.0
2951
2952        but the table returned (`table3`) is the same as `table`
2953
2954        >>> print(table3) # doctest: +NORMALIZE_WHITESPACE
2955        # Constructed from biom file
2956        #OTU ID S1  S2  S3
2957        O1  0.0 0.0 0.5
2958        O2  0.5 1.5 21.0
2959
2960        """
2961        table = self if inplace else self.copy()
2962
2963        metadata = table.metadata(axis=axis)
2964        ids = table.ids(axis=axis)
2965        arr = table._get_sparse_data(axis=axis)
2966
2967        axis = table._axis_to_num(axis)
2968
2969        _transform(arr, ids, metadata, f, axis)
2970        arr.eliminate_zeros()
2971
2972        table._data = arr
2973
2974        return table
2975
2976    def rankdata(self, axis='sample', inplace=True, method='average'):
2977        """Convert values to rank abundances from smallest to largest
2978
2979        Parameters
2980        ----------
2981        axis : {'sample', 'observation'}, optional
2982            The axis to use for ranking.
2983        inplace : bool, optional
2984            Defaults to ``True``. If ``True``, performs the ranking in
2985            place. Otherwise, returns a new table with ranking applied.
2986        method : str, optional
2987            The method for handling ties in counts. This can be any valid
2988            string that can be passed to `scipy.stats.rankdata`.
2989
2990        Returns
2991        -------
2992        biom.Table
2993            The rank-abundance-transformed table.
2994
2995        Raises
2996        ------
2997        ValueError
2998            If unknown ``method`` is provided.
2999
3000        See Also
3001        --------
3002        scipy.stats.rankdata
3003
3004        Examples
3005        --------
3006        >>> import numpy as np
3007        >>> from biom import Table
3008        >>> data = np.array([[ 99,  12,   8], [  0,  42,   7],
3009        ...                  [112,  42,   6], [  5,  75,   5]])
3010        >>> t = Table(data, sample_ids=['s1', 's2', 's3'],
3011        ...           observation_ids=['o1', 'o2', 'o3', 'o4'])
3012
3013        Convert observation counts to their ranked abundance, from smallest
3014        to largest.
3015
3016        >>> print(t.rankdata())  # doctest: +NORMALIZE_WHITESPACE
3017        # Constructed from biom file
3018        #OTU ID	s1	s2	s3
3019        o1	2.0	1.0	4.0
3020        o2	0.0	2.5	3.0
3021        o3	3.0	2.5	2.0
3022        o4	1.0	4.0	1.0
3023
3024        """
3025        def f(val, id_, _):
3026            return scipy.stats.rankdata(val, method=method)
3027        return self.transform(f, axis=axis, inplace=inplace)
3028
3029    def norm(self, axis='sample', inplace=True):
3030        """Normalize in place sample values by an observation, or vice versa.
3031
3032        Parameters
3033        ----------
3034        axis : {'sample', 'observation'}, optional
3035            The axis to use for normalization.
3036        inplace : bool, optional
3037            Defaults to ``True``. If ``True``, performs the normalization in
3038            place. Otherwise, returns a new table with the normalization
3039            applied.
3040
3041        Returns
3042        -------
3043        biom.Table
3044            The normalized table
3045
3046        Examples
3047        --------
3048        >>> import numpy as np
3049        >>> from biom.table import Table
3050
3051        Create a 2x2 table:
3052
3053        >>> data = np.asarray([[2, 0], [6, 1]])
3054        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2'])
3055
3056        Get a version of the table normalized on the 'sample' axis, leaving the
3057        original table untouched:
3058
3059        >>> new_table = table.norm(inplace=False)
3060        >>> print(table) # doctest: +NORMALIZE_WHITESPACE
3061        # Constructed from biom file
3062        #OTU ID S1  S2
3063        O1  2.0 0.0
3064        O2  6.0 1.0
3065        >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE
3066        # Constructed from biom file
3067        #OTU ID S1  S2
3068        O1  0.25    0.0
3069        O2  0.75    1.0
3070
3071        Get a version of the table normalized on the 'observation' axis,
3072        again leaving the original table untouched:
3073
3074        >>> new_table = table.norm(axis='observation', inplace=False)
3075        >>> print(table) # doctest: +NORMALIZE_WHITESPACE
3076        # Constructed from biom file
3077        #OTU ID S1  S2
3078        O1  2.0 0.0
3079        O2  6.0 1.0
3080        >>> print(new_table) # doctest: +NORMALIZE_WHITESPACE
3081        # Constructed from biom file
3082        #OTU ID S1  S2
3083        O1  1.0 0.0
3084        O2  0.857142857143  0.142857142857
3085
3086        Do the same normalization on 'observation', this time in-place:
3087
3088        >>> table.norm(axis='observation')
3089        2 x 2 <class 'biom.table.Table'> with 3 nonzero entries (75% dense)
3090        >>> print(table) # doctest: +NORMALIZE_WHITESPACE
3091        # Constructed from biom file
3092        #OTU ID S1  S2
3093        O1  1.0 0.0
3094        O2  0.857142857143  0.142857142857
3095        """
3096        def f(val, id_, _):
3097            return val / float(val.sum())
3098
3099        return self.transform(f, axis=axis, inplace=inplace)
3100
3101    def nonzero(self):
3102        """Yields locations of nonzero elements within the data matrix
3103
3104        Returns
3105        -------
3106        generator
3107            Yields ``(observation_id, sample_id)`` for each nonzero element
3108        """
3109        csr = self._data.tocsr()
3110        samp_ids = self.ids()
3111        obs_ids = self.ids(axis='observation')
3112
3113        indptr = csr.indptr
3114        indices = csr.indices
3115        for row_idx in range(indptr.size - 1):
3116            start = indptr[row_idx]
3117            end = indptr[row_idx+1]
3118
3119            obs_id = obs_ids[row_idx]
3120            for col_idx in indices[start:end]:
3121                yield (obs_id, samp_ids[col_idx])
3122
3123    def nonzero_counts(self, axis, binary=True):
3124        """Get nonzero summaries about an axis
3125
3126        Parameters
3127        ----------
3128        axis : {'sample', 'observation', 'whole'}
3129            The axis on which to count nonzero entries
3130        binary : bool, optional
3131            Defaults to ``True``. If ``True``, return number of nonzero
3132            entries. If ``False``, sum the values of the entries.
3133
3134        Returns
3135        -------
3136        numpy.array
3137            Counts in index order to the axis
3138        """
3139        if binary:
3140            dtype = 'int'
3141
3142            def op(x):
3143                return x.nonzero()[0].size
3144        else:
3145            dtype = self.dtype
3146
3147            def op(x):
3148                return x.sum()
3149
3150        if axis in ('sample', 'observation'):
3151            # can use np.bincount for CSMat or ScipySparse
3152            result = zeros(len(self.ids(axis=axis)), dtype=dtype)
3153            for idx, vals in enumerate(self.iter_data(axis=axis)):
3154                result[idx] = op(vals)
3155        else:
3156            result = zeros(1, dtype=dtype)
3157            for vals in self.iter_data():
3158                result[0] += op(vals)
3159
3160        return result
3161
3162    def _union_id_order(self, a, b):
3163        """Determines merge order for id lists A and B"""
3164        all_ids = list(a[:])
3165        all_ids.extend(b[:])
3166        new_order = {}
3167        idx = 0
3168        for id_ in all_ids:
3169            if id_ not in new_order:
3170                new_order[id_] = idx
3171                idx += 1
3172        return new_order
3173
3174    def _intersect_id_order(self, a, b):
3175        """Determines the merge order for id lists A and B"""
3176        all_b = set(b[:])
3177        new_order = {}
3178        idx = 0
3179        for id_ in a:
3180            if id_ in all_b:
3181                new_order[id_] = idx
3182                idx += 1
3183        return new_order
3184
3185    def remove_empty(self, axis='whole', inplace=True):
3186        """Remove empty samples or observations from the table
3187
3188        Parameters
3189        ----------
3190        axis : {'whole', 'sample', 'observation'}, optional
3191            The axis on which to operate.
3192        inplace : bool, optional
3193            If ``True`` vectors are removed in ``self``; if ``False`` the
3194            vectors are removed in a new table is returned.
3195
3196        Raises
3197        ------
3198        UnknownAxisError
3199            If the axis is not recognized.
3200
3201        Returns
3202        -------
3203        Table
3204            A table object with the zero'd rows, or columns removed as
3205            specified by the `axis` parameter.
3206        """
3207        if axis not in ['sample', 'observation', 'whole']:
3208            raise UnknownAxisError(axis)
3209
3210        if inplace:
3211            table = self
3212        else:
3213            table = self.copy()
3214
3215        if axis == 'whole':
3216            axes = ['sample', 'observation']
3217        else:
3218            axes = [axis]
3219
3220        for ax in axes:
3221            table.filter(table.ids(axis=ax)[table.sum(axis=ax) > 0], axis=ax)
3222
3223        return table
3224
3225    def align_to(self, other, axis='detect'):
3226        """Align self to other over a requested axis
3227
3228        Parameters
3229        ----------
3230        other : biom.Table
3231            The table to align too
3232        axis : str, optional, {sample, observation, both, detect}
3233            If 'sample' or 'observation', align to that axis. If 'both', align
3234            both axes. If 'detect', align what can be aligned.
3235
3236        Raises
3237        ------
3238        DisjointIDError
3239            If the requested axis can't be aligned.
3240        UnknownAxisError
3241            If an unrecognized axis is specified.
3242
3243        Examples
3244        --------
3245        Align one table to another, for instance a table of 16S data to a table
3246        of metagenomic data. In this example, we're aligning the samples of the
3247        two tables.
3248
3249        >>> from biom import Table
3250        >>> import numpy as np
3251        >>> amplicon = Table(np.array([[0, 1, 2], [3, 4, 5]]),
3252        ...                  ['Ecoli', 'Staphylococcus'],
3253        ...                  ['S1', 'S2', 'S3'])
3254        >>> metag = Table(np.array([[6, 7, 8], [9, 10, 11]]),
3255        ...               ['geneA', 'geneB'],
3256        ...               ['S3', 'S2', 'S1'])
3257        >>> amplicon = amplicon.align_to(metag)
3258        >>> print(amplicon)  # doctest: +NORMALIZE_WHITESPACE
3259        # Constructed from biom file
3260        #OTU ID	S3	S2	S1
3261        Ecoli	2.0	1.0	0.0
3262        Staphylococcus	5.0	4.0	3.0
3263        """
3264        self_o = set(self.ids(axis='observation'))
3265        self_s = set(self.ids())
3266        other_o = set(other.ids(axis='observation'))
3267        other_s = set(other.ids())
3268
3269        alignable_o = self_o == other_o
3270        alignable_s = self_s == other_s
3271
3272        if axis == 'both' and not (alignable_o and alignable_s):
3273            raise DisjointIDError("Cannot align both axes")
3274        elif axis == 'sample' and not alignable_s:
3275            raise DisjointIDError("Cannot align samples")
3276        elif axis == 'observation' and not alignable_o:
3277            raise DisjointIDError("Cannot align observations")
3278        elif axis == 'detect' and not (alignable_o or alignable_s):
3279            raise DisjointIDError("Neither axis appears alignable")
3280
3281        if axis == 'both':
3282            order = ['observation', 'sample']
3283        elif axis == 'detect':
3284            order = []
3285            if alignable_s:
3286                order.append('sample')
3287            if alignable_o:
3288                order.append('observation')
3289        elif axis == 'sample':
3290            order = ['sample']
3291        elif axis == 'observation':
3292            order = ['observation']
3293        else:
3294            raise UnknownAxisError("Unrecognized axis: %s" % axis)
3295
3296        table = self
3297        for aln_axis in order:
3298            table = table.sort_order(other.ids(axis=aln_axis),
3299                                     axis=aln_axis)
3300
3301        return table
3302
3303    def concat(self, others, axis='sample'):
3304        """Concatenate tables if axis is disjoint
3305
3306        Parameters
3307        ----------
3308        others : iterable of biom.Table, or a single biom.Table instance
3309            Tables to concatenate
3310        axis : {'sample', 'observation'}, optional
3311            The axis to concatenate on. i.e., if axis is 'sample', then tables
3312            will be joined such that the set of sample IDs in the resulting
3313            table will be the union of sample IDs across all tables in others.
3314
3315        Raises
3316        ------
3317        DisjointIDError
3318            If IDs over the axis are not disjoint.
3319
3320        Notes
3321        -----
3322        The type of the table is inherited from self.
3323
3324        Examples
3325        --------
3326        Concatenate three tables in which the sample IDs are disjoint. Note
3327        the observation IDs in this example are not disjoint (although they
3328        can be):
3329
3330        >>> from biom import Table
3331        >>> import numpy as np
3332        >>> a = Table(np.array([[0, 1, 2], [3, 4, 5]]), ['O1', 'O2'],
3333        ...                     ['S1', 'S2', 'S3'],
3334        ...                     [{'taxonomy': 'foo'}, {'taxonomy': 'bar'}])
3335        >>> b = Table(np.array([[6, 7, 8], [9, 10, 11]]), ['O3', 'O4'],
3336        ...                     ['S4', 'S5', 'S6'],
3337        ...                     [{'taxonomy': 'baz'}, {'taxonomy': 'foobar'}])
3338        >>> c = Table(np.array([[12, 13, 14], [15, 16, 17]]), ['O1', 'O5'],
3339        ...                     ['S7', 'S8', 'S9'],
3340        ...                     [{'taxonomy': 'foo'}, {'taxonomy': 'biz'}])
3341        >>> d = a.concat([b, c])
3342        >>> print(d)  # doctest: +NORMALIZE_WHITESPACE
3343        # Constructed from biom file
3344        #OTU ID	S1	S2	S3	S4	S5	S6	S7	S8	S9
3345        O1	0.0	1.0	2.0	0.0	0.0	0.0	12.0	13.0	14.0
3346        O2	3.0	4.0	5.0	0.0	0.0	0.0	0.0	0.0	0.0
3347        O3	0.0	0.0	0.0	6.0	7.0	8.0	0.0	0.0	0.0
3348        O4	0.0	0.0	0.0	9.0	10.0	11.0	0.0	0.0	0.0
3349        O5	0.0	0.0	0.0	0.0	0.0	0.0	15.0	16.0	17.0
3350
3351        """
3352        if isinstance(others, self.__class__):
3353            others = [others, ]
3354
3355        # we grow along the opposite axis
3356        invaxis = self._invert_axis(axis)
3357        if axis == 'sample':
3358            dim_getter = itemgetter(1)
3359            stack = hstack
3360            invstack = vstack
3361        else:
3362            dim_getter = itemgetter(0)
3363            stack = vstack
3364            invstack = hstack
3365
3366        axis_ids = set()
3367        invaxis_ids = set()
3368        invaxis_metadata = {}
3369
3370        all_tables = others[:]
3371        all_tables.insert(0, self)
3372
3373        # verify disjoint, and fetch all ids from all tables
3374        for table in all_tables:
3375            table_axis_ids = table.ids(axis=axis)
3376            table_invaxis_order = table.ids(axis=invaxis)
3377            table_invaxis = set(table_invaxis_order)
3378
3379            # test we have disjoint IDs
3380            if not axis_ids.isdisjoint(table_axis_ids):
3381                raise DisjointIDError("IDs are not disjoint")
3382            axis_ids.update(table_axis_ids)
3383
3384            if set(table_invaxis) - invaxis_ids:
3385                for i in (set(table_invaxis) - invaxis_ids):
3386                    invaxis_metadata[i] = table.metadata(i, axis=invaxis)
3387
3388                # add to our perspective all inv axis IDs
3389                invaxis_ids.update(table_invaxis)
3390
3391        invaxis_order = sorted(invaxis_ids)
3392
3393        # determine what inv axis IDs do not exist per table, and pad and sort
3394        # as necessary
3395        padded_tables = []
3396        for table in all_tables:
3397            missing_ids = list(invaxis_ids - set(table.ids(axis=invaxis)))
3398
3399            if missing_ids:
3400                # determine new shape
3401                n_invaxis = len(missing_ids)
3402                n_axis = len(table.ids(axis=axis))
3403                if axis == 'sample':
3404                    shape = (n_invaxis, n_axis)
3405                else:
3406                    shape = (n_axis, n_invaxis)
3407
3408                # create the padded matrix
3409                zerod = csr_matrix(shape)
3410                tmp_mat = invstack([table.matrix_data, zerod])
3411
3412                # resolve invert axis ids and metadata
3413                tmp_inv_ids = list(table.ids(axis=invaxis))
3414                tmp_inv_ids.extend(missing_ids)
3415                tmp_inv_md = table.metadata(axis=invaxis)
3416                if tmp_inv_md is None:
3417                    tmp_inv_md = [None] * len(table.ids(axis=invaxis))
3418                else:
3419                    tmp_inv_md = list(tmp_inv_md)
3420                tmp_inv_md.extend([invaxis_metadata[i] for i in missing_ids])
3421
3422                # resolve axis ids and metadata
3423                tmp_ids = list(table.ids(axis=axis))
3424                tmp_md = table.metadata(axis=axis)
3425
3426                # resolve construction based off axis. This really should be
3427                # pushed to a classmethod.
3428                if axis == 'sample':
3429                    tmp_table = self.__class__(tmp_mat, tmp_inv_ids, tmp_ids,
3430                                               tmp_inv_md, tmp_md)
3431                else:
3432                    tmp_table = self.__class__(tmp_mat, tmp_ids, tmp_inv_ids,
3433                                               tmp_md, tmp_inv_md)
3434            else:
3435                tmp_table = table
3436
3437            # sort the table if necessary
3438            if (tmp_table.ids(axis=invaxis) == invaxis_order).all():
3439                padded_tables.append(tmp_table)
3440            else:
3441                padded_tables.append(tmp_table.sort_order(invaxis_order,
3442                                                          axis=invaxis))
3443
3444        # actually concatenate the matrices, IDs and metadata
3445        concat_mat = stack([t.matrix_data for t in padded_tables])
3446        concat_ids = np.concatenate([t.ids(axis=axis) for t in padded_tables])
3447        concat_md = []
3448        for table in padded_tables:
3449            metadata = table.metadata(axis=axis)
3450            if metadata is None:
3451                metadata = [None] * dim_getter(table.shape)
3452            concat_md.extend(metadata)
3453
3454        # inverse axis sourced from whatever is in the first table
3455        inv_md = padded_tables[0].metadata(axis=invaxis)
3456        if axis == 'sample':
3457            concat = self.__class__(concat_mat, invaxis_order, concat_ids,
3458                                    inv_md, concat_md, type=self.type)
3459        else:
3460            concat = self.__class__(concat_mat, concat_ids, invaxis_order,
3461                                    concat_md, inv_md, type=self.type)
3462
3463        return concat
3464
3465    def _fast_merge(self, others):
3466        """For simple merge operations it is faster to aggregate using pandas
3467
3468        Parameters
3469        ----------
3470        others : Table, or Iterable of Table
3471            If a Table, then merge with that table. If an iterable, then merge
3472            all of the tables
3473        """
3474        tables = [self] + others
3475
3476        # gather all identifiers across tables
3477        all_features = reduce(or_, [set(t.ids(axis='observation'))
3478                                    for t in tables])
3479        all_samples = reduce(or_, [set(t.ids()) for t in tables])
3480
3481        # generate unique integer ids for the identifiers, and let's order
3482        # it to be polite
3483        feature_map = {i: idx for idx, i in enumerate(sorted(all_features))}
3484        sample_map = {i: idx for idx, i in enumerate(sorted(all_samples))}
3485
3486        # produce a new stable order
3487        get1 = lambda x: x[1]  # noqa
3488        feature_order = [k for k, v in sorted(feature_map.items(), key=get1)]
3489        sample_order = [k for k, v in sorted(sample_map.items(), key=get1)]
3490
3491        mi = []
3492        values = []
3493        for table in tables:
3494            # these data are effectively [((row_index, col_index), value), ]
3495            data_as_dok = table.matrix_data.todok()
3496
3497            # construct a map of the feature integer index to what it is in
3498            # the full table
3499            feat_ids = table.ids(axis='observation')
3500            samp_ids = table.ids()
3501            table_features = {idx: feature_map[i]
3502                              for idx, i in enumerate(feat_ids)}
3503            table_samples = {idx: sample_map[i]
3504                             for idx, i in enumerate(samp_ids)}
3505
3506            for (f, s), v in data_as_dok.items():
3507                # collect the indices and values, adjusting the indices as we
3508                # go
3509                mi.append((table_features[f], table_samples[s]))
3510                values.append(v)
3511
3512        # construct a multiindex of the indices where the outer index is the
3513        # feature and the inner index is the sample
3514        mi = pd.MultiIndex.from_tuples(mi)
3515        grouped = pd.Series(values, index=mi)
3516
3517        # aggregate the values where the outer and inner values in the
3518        # multiindex are the same
3519        collapsed_rcv = grouped.groupby(level=[0, 1]).sum()
3520
3521        # convert into a representation understood by the Table constructor
3522        list_list = [[r, c, v] for (r, c), v in collapsed_rcv.items()]
3523
3524        return self.__class__(list_list, feature_order, sample_order)
3525
3526    def merge(self, other, sample='union', observation='union',
3527              sample_metadata_f=prefer_self,
3528              observation_metadata_f=prefer_self):
3529        """Merge two tables together
3530
3531        The axes, samples and observations, can be controlled independently.
3532        Both can work on either "union" or "intersection".
3533
3534        `sample_metadata_f` and `observation_metadata_f` define how to
3535        merge metadata between tables. The default is to just keep the metadata
3536        associated to self if self has metadata otherwise take metadata from
3537        other. These functions are given both metadata dicts and must return
3538        a single metadata dict
3539
3540        Parameters
3541        ----------
3542        other : biom.Table or Iterable of Table
3543            The other table to merge with this one. If an iterable, the tables
3544            are expected to not have metadata.
3545        sample : {'union', 'intersection'}, optional
3546        observation : {'union', 'intersection'}, optional
3547        sample_metadata_f : function, optional
3548            Defaults to ``biom.util.prefer_self``. Defines how to handle sample
3549            metadata during merge.
3550        obesrvation_metadata_f : function, optional
3551            Defaults to ``biom.util.prefer_self``. Defines how to handle
3552            observation metdata during merge.
3553
3554        Returns
3555        -------
3556        biom.Table
3557            The merged table
3558
3559        Notes
3560        -----
3561        - If ``sample_metadata_f`` and ``observation_metadata_f`` are None,
3562            then a fast merge is applied.
3563        - There is an implicit type conversion to ``float``.
3564        - The return type is always that of ``self``
3565
3566        Examples
3567        --------
3568
3569        >>> import numpy as np
3570        >>> from biom.table import Table
3571
3572        Create a 2x2 table and a 3x2 table:
3573
3574        >>> d_a = np.asarray([[2, 0], [6, 1]])
3575        >>> t_a = Table(d_a, ['O1', 'O2'], ['S1', 'S2'])
3576        >>> d_b = np.asarray([[4, 5], [0, 3], [10, 10]])
3577        >>> t_b = Table(d_b, ['O1', 'O2', 'O3'], ['S1', 'S2'])
3578
3579        Merging the table results in the overlapping samples/observations (see
3580        `O1` and `S2`) to be summed and the non-overlapping ones to be added to
3581        the resulting table (see `S3`).
3582
3583        >>> merged_table = t_a.merge(t_b)
3584        >>> print(merged_table)  # doctest: +NORMALIZE_WHITESPACE
3585        # Constructed from biom file
3586        #OTU ID	S1	S2
3587        O1	6.0	5.0
3588        O2	6.0	4.0
3589        O3	10.0	10.0
3590
3591        """
3592        s_md = self.metadata()
3593        o_md = self.metadata(axis='observation')
3594        no_md = (s_md is None) and (o_md is None)
3595        ignore_md = (sample_metadata_f is None) and \
3596            (observation_metadata_f is None)
3597
3598        if no_md or ignore_md:
3599            if sample == 'union' and observation == 'union':
3600                if isinstance(other, (list, set, tuple)):
3601                    return self._fast_merge(other)
3602                else:
3603                    return self._fast_merge([other, ])
3604
3605        # determine the sample order in the resulting table
3606        if sample == 'union':
3607            new_samp_order = self._union_id_order(self.ids(), other.ids())
3608        elif sample == 'intersection':
3609            new_samp_order = self._intersect_id_order(self.ids(), other.ids())
3610        else:
3611            raise TableException("Unknown sample merge type: %s" % sample)
3612
3613        # determine the observation order in the resulting table
3614        if observation == 'union':
3615            new_obs_order = self._union_id_order(
3616                self.ids(axis='observation'), other.ids(axis='observation'))
3617        elif observation == 'intersection':
3618            new_obs_order = self._intersect_id_order(
3619                self.ids(axis='observation'), other.ids(axis='observation'))
3620        else:
3621            raise TableException(
3622                "Unknown observation merge type: %s" %
3623                observation)
3624
3625        # convert these to lists, no need to be dictionaries and reduces
3626        # calls to items() and allows for pre-caluculating insert order
3627        new_samp_order = sorted(new_samp_order.items(), key=itemgetter(1))
3628        new_obs_order = sorted(new_obs_order.items(), key=itemgetter(1))
3629
3630        # if we don't have any samples, complain loudly. This is likely from
3631        # performing an intersection without overlapping ids
3632        if not new_samp_order:
3633            raise TableException("No samples in resulting table!")
3634        if not new_obs_order:
3635            raise TableException("No observations in resulting table!")
3636
3637        # helper index lookups
3638        other_obs_idx = other._obs_index
3639        self_obs_idx = self._obs_index
3640        other_samp_idx = other._sample_index
3641        self_samp_idx = self._sample_index
3642
3643        # pre-calculate sample order from each table. We only need to do this
3644        # once which dramatically reduces the number of dict lookups necessary
3645        # within the inner loop
3646        other_samp_order = []
3647        self_samp_order = []
3648        for samp_id, nsi in new_samp_order:  # nsi -> new_sample_index
3649            other_samp_order.append((nsi, other_samp_idx.get(samp_id, None)))
3650            self_samp_order.append((nsi, self_samp_idx.get(samp_id, None)))
3651
3652        # pre-allocate the a list for placing the resulting vectors as the
3653        # placement id is not ordered
3654        vals = [None for i in range(len(new_obs_order))]
3655
3656        # POSSIBLE DECOMPOSITION
3657        # resulting sample ids and sample metadata
3658        sample_ids = []
3659        sample_md = []
3660        self_sample_md = self.metadata()
3661        other_sample_md = other.metadata()
3662        for id_, idx in new_samp_order:
3663            sample_ids.append(id_)
3664
3665            # if we have sample metadata, grab it
3666            if self_sample_md is None or not self.exists(id_):
3667                self_md = None
3668            else:
3669                self_md = self_sample_md[self_samp_idx[id_]]
3670
3671            # if we have sample metadata, grab it
3672            if other_sample_md is None or not other.exists(id_):
3673                other_md = None
3674            else:
3675                other_md = other_sample_md[other_samp_idx[id_]]
3676
3677            sample_md.append(sample_metadata_f(self_md, other_md))
3678
3679        # POSSIBLE DECOMPOSITION
3680        # resulting observation ids and sample metadata
3681        obs_ids = []
3682        obs_md = []
3683        self_obs_md = self.metadata(axis='observation')
3684        other_obs_md = other.metadata(axis='observation')
3685        for id_, idx in new_obs_order:
3686            obs_ids.append(id_)
3687
3688            # if we have observation metadata, grab it
3689            if self_obs_md is None or not self.exists(id_, axis="observation"):
3690                self_md = None
3691            else:
3692                self_md = self_obs_md[self_obs_idx[id_]]
3693
3694            # if we have observation metadata, grab it
3695            if other_obs_md is None or \
3696                    not other.exists(id_, axis="observation"):
3697                other_md = None
3698            else:
3699                other_md = other_obs_md[other_obs_idx[id_]]
3700
3701            obs_md.append(observation_metadata_f(self_md, other_md))
3702
3703        # length used for construction of new vectors
3704        vec_length = len(new_samp_order)
3705
3706        # walk over observations in our new order
3707        for obs_id, new_obs_idx in new_obs_order:
3708            # create new vector for matrix values
3709            new_vec = zeros(vec_length, dtype='float')
3710
3711            # This method allows for the creation of a matrix of self type.
3712            # See note above
3713            # new_vec = data_f()
3714
3715            # see if the observation exists in other, if so, pull it out.
3716            # if not, set to the placeholder missing
3717            if other.exists(obs_id, axis="observation"):
3718                other_vec = other.data(obs_id, 'observation')
3719            else:
3720                other_vec = None
3721
3722            # see if the observation exists in self, if so, pull it out.
3723            # if not, set to the placeholder missing
3724            if self.exists(obs_id, axis="observation"):
3725                self_vec = self.data(obs_id, 'observation')
3726            else:
3727                self_vec = None
3728
3729            # short circuit. If other doesn't have any values, then we can just
3730            # take all values from self
3731            if other_vec is None:
3732                for (n_idx, s_idx) in self_samp_order:
3733                    if s_idx is not None:
3734                        new_vec[n_idx] = self_vec[s_idx]
3735
3736            # short circuit. If self doesn't have any values, then we can just
3737            # take all values from other
3738            elif self_vec is None:
3739                for (n_idx, o_idx) in other_samp_order:
3740                    if o_idx is not None:
3741                        new_vec[n_idx] = other_vec[o_idx]
3742
3743            else:
3744                # NOTE: DM 7.5.12, no observed improvement at the profile level
3745                # was made on this inner loop by using self_samp_order and
3746                # other_samp_order lists.
3747
3748                # walk over samples in our new order
3749                for samp_id, new_samp_idx in new_samp_order:
3750                    # pull out each individual sample value. This is expensive,
3751                    # but the vectors are in a different alignment. It is
3752                    # possible that this could be improved with numpy take but
3753                    # needs to handle missing values appropriately
3754                    if samp_id not in self_samp_idx:
3755                        self_vec_value = 0
3756                    else:
3757                        self_vec_value = self_vec[self_samp_idx[samp_id]]
3758
3759                    if samp_id not in other_samp_idx:
3760                        other_vec_value = 0
3761                    else:
3762                        other_vec_value = other_vec[other_samp_idx[samp_id]]
3763
3764                    new_vec[new_samp_idx] = self_vec_value + other_vec_value
3765
3766            # convert our new vector to self type as to make sure we don't
3767            # accidently force a dense representation in memory
3768            vals[new_obs_idx] = self._conv_to_self_type(new_vec)
3769
3770        return self.__class__(self._conv_to_self_type(vals), obs_ids[:],
3771                              sample_ids[:], obs_md, sample_md)
3772
3773    @classmethod
3774    def from_hdf5(cls, h5grp, ids=None, axis='sample', parse_fs=None,
3775                  subset_with_metadata=True):
3776        """Parse an HDF5 formatted BIOM table
3777
3778        If ids is provided, only the samples/observations listed in ids
3779        (depending on the value of axis) will be loaded
3780
3781        The expected structure of this group is below. A few basic definitions,
3782        N is the number of observations and M is the number of samples. Data
3783        are stored in both compressed sparse row (for observation oriented
3784        operations) and compressed sparse column (for sample oriented
3785        operations).
3786
3787        Notes
3788        -----
3789        The expected HDF5 group structure is below. An example of an HDF5 file
3790        in DDL can be found here [3]_.
3791
3792        - ./id                                                  : str, an \
3793arbitrary ID
3794        - ./type                                                : str, the \
3795table type (e.g, OTU table)
3796        - ./format-url                                          : str, a URL \
3797that describes the format
3798        - ./format-version                                      : two element \
3799tuple of int32, major and minor
3800        - ./generated-by                                        : str, what \
3801generated this file
3802        - ./creation-date                                       : str, ISO \
3803format
3804        - ./shape                                               : two element \
3805tuple of int32, N by M
3806        - ./nnz                                                 : int32 or \
3807int64, number of non zero elems
3808        - ./observation                                         : Group
3809        - ./observation/ids                                     : (N,) dataset\
3810 of str or vlen str
3811        - ./observation/matrix                                  : Group
3812        - ./observation/matrix/data                             : (nnz,) \
3813dataset of float64
3814        - ./observation/matrix/indices                          : (nnz,) \
3815dataset of int32
3816        - ./observation/matrix/indptr                           : (M+1,) \
3817dataset of int32
3818        - ./observation/metadata                                : Group
3819        - [./observation/metadata/foo]                          : Optional, \
3820(N,) dataset of any valid HDF5 type in index order with IDs.
3821        - ./observation/group-metadata                          : Group
3822        - [./observation/group-metadata/foo]                    : Optional, \
3823(?,) dataset of group metadata that relates IDs
3824        - [./observation/group-metadata/foo.attrs['data_type']] : attribute of\
3825 the foo dataset that describes contained type (e.g., newick)
3826        - ./sample                                              : Group
3827        - ./sample/ids                                          : (M,) dataset\
3828 of str or vlen str
3829        - ./sample/matrix                                       : Group
3830        - ./sample/matrix/data                                  : (nnz,) \
3831dataset of float64
3832        - ./sample/matrix/indices                               : (nnz,) \
3833dataset of int32
3834        - ./sample/matrix/indptr                                : (N+1,) \
3835dataset of int32
3836        - ./sample/metadata                                     : Group
3837        - [./sample/metadata/foo]                               : Optional, \
3838(M,) dataset of any valid HDF5 type in index order with IDs.
3839        - ./sample/group-metadata                               : Group
3840        - [./sample/group-metadata/foo]                         : Optional, \
3841(?,) dataset of group metadata that relates IDs
3842        - [./sample/group-metadata/foo.attrs['data_type']]      : attribute of\
3843 the foo dataset that describes contained type (e.g., newick)
3844
3845        The '?' character on the dataset size means that it can be of arbitrary
3846        length.
3847
3848        The expected structure for each of the metadata datasets is a list of
3849        atomic type objects (int, float, str, ...), where the index order of
3850        the list corresponds to the index order of the relevant axis IDs.
3851        Special metadata fields have been defined, and they are stored in a
3852        specific way. Currently, the available special metadata fields are:
3853
3854        - taxonomy: (N, ?) dataset of str or vlen str
3855        - KEGG_Pathways: (N, ?) dataset of str or vlen str
3856        - collapsed_ids: (N, ?) dataset of str or vlen str
3857
3858        Parameters
3859        ----------
3860        h5grp : a h5py ``Group`` or an open h5py ``File``
3861        ids : iterable
3862            The sample/observation ids of the samples/observations that we need
3863            to retrieve from the hdf5 biom table
3864        axis : {'sample', 'observation'}, optional
3865            The axis to subset on
3866        parse_fs : dict, optional
3867            Specify custom parsing functions for metadata fields. This dict is
3868            expected to be {'metadata_field': function}, where the function
3869            signature is (object) corresponding to a single row in the
3870            associated metadata dataset. The return from this function an
3871            object as well, and is the parsed representation of the metadata.
3872        subset_with_metadata : bool, optional
3873            When subsetting (i.e., `ids` is `not None`), whether to also parse
3874            the metadata. By default, the metadata are also subset. The reason
3875            for exposing this functionality is that, for large tables, there
3876            exists a very large overhead for this metadata manipulation.
3877
3878        Returns
3879        -------
3880        biom.Table
3881            A BIOM ``Table`` object
3882
3883        Raises
3884        ------
3885        ValueError
3886            If `ids` are not a subset of the samples or observations ids
3887            present in the hdf5 biom table
3888            If h5grp is not a HDF5 file or group
3889
3890        References
3891        ----------
3892        .. [1] http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/sci\
3893py.sparse.csr_matrix.html
3894        .. [2] http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/sci\
3895py.sparse.csc_matrix.html
3896        .. [3] http://biom-format.org/documentation/format_versions/biom-2.0.\
3897html
3898
3899        See Also
3900        --------
3901        Table.to_hdf5
3902
3903        Examples
3904        --------
3905        >>> from biom.table import Table
3906        >>> from biom.util import biom_open
3907        >>> with biom_open('rich_sparse_otu_table_hdf5.biom') as f \
3908# doctest: +SKIP
3909        >>>     t = Table.from_hdf5(f) # doctest: +SKIP
3910
3911        Parse a hdf5 biom table subsetting observations
3912        >>> from biom.util import biom_open # doctest: +SKIP
3913        >>> from biom.parse import parse_biom_table
3914        >>> with biom_open('rich_sparse_otu_table_hdf5.biom') as f \
3915# doctest: +SKIP
3916        >>>     t = Table.from_hdf5(f, ids=["GG_OTU_1"],
3917        ...                         axis='observation') # doctest: +SKIP
3918        """
3919        if not HAVE_H5PY:
3920            raise RuntimeError("h5py is not in the environment, HDF5 support "
3921                               "is not available")
3922
3923        import h5py
3924        if not isinstance(h5grp, (h5py.Group, h5py.File)):
3925            raise ValueError("h5grp does not appear to be an HDF5 file or "
3926                             "group")
3927
3928        if axis not in ['sample', 'observation']:
3929            raise UnknownAxisError(axis)
3930
3931        if parse_fs is None:
3932            parse_fs = {}
3933
3934        if not subset_with_metadata and ids is not None:
3935            ids = set(ids)
3936
3937            raw_indices = h5grp['%s/matrix/indices' % axis]
3938            raw_indptr = h5grp['%s/matrix/indptr' % axis]
3939            raw_data = h5grp['%s/matrix/data' % axis]
3940            axis_ids = h5grp['%s/ids' % axis][:]
3941
3942            to_keep = np.array([i for i, id_ in enumerate(axis_ids)
3943                                if id_ in ids], dtype=int)
3944            start_end = [(raw_indptr[i], raw_indptr[i+1]) for i in to_keep]
3945            indptr = np.empty(len(to_keep) + 1, dtype=np.int32)
3946            indptr[0] = 0
3947            indptr[1:] = np.array([e - s for s, e in start_end]).cumsum()
3948            data = np.concatenate([raw_data[s:e] for s, e in start_end])
3949            indices = np.concatenate([raw_indices[s:e] for s, e in start_end])
3950
3951            if axis == 'sample':
3952                obs_ids = h5grp['observation/ids'][:]
3953                samp_ids = axis_ids[to_keep]
3954                shape = (len(obs_ids), len(to_keep))
3955                mat = csc_matrix((data, indices, indptr), shape=shape)
3956            else:
3957                samp_ids = h5grp['sample/ids'][:]
3958                obs_ids = axis_ids[to_keep]
3959                shape = (len(to_keep), len(samp_ids))
3960                mat = csr_matrix((data, indices, indptr), shape=shape)
3961
3962            return Table(mat, obs_ids, samp_ids)
3963
3964        id_ = h5grp.attrs['id']
3965        create_date = h5grp.attrs['creation-date']
3966        generated_by = h5grp.attrs['generated-by']
3967
3968        shape = h5grp.attrs['shape']
3969        type_ = None if h5grp.attrs['type'] == '' else h5grp.attrs['type']
3970
3971        if isinstance(id_, six.binary_type):
3972            if six.PY3:
3973                id_ = id_.decode('ascii')
3974            else:
3975                id_ = str(id_)
3976
3977        if isinstance(type_, six.binary_type):
3978            if six.PY3:
3979                type_ = type_.decode('ascii')
3980            else:
3981                type_ = str(type_)
3982
3983        def axis_load(grp):
3984            """Loads all the data of the given group"""
3985            # fetch all of the IDs
3986            ids = grp['ids'][:]
3987
3988            if ids.size > 0 and isinstance(ids[0], bytes):
3989                ids = np.array([i.decode('utf8') for i in ids])
3990
3991            parser = defaultdict(lambda: general_parser)
3992            parser['taxonomy'] = vlen_list_of_str_parser
3993            parser['KEGG_Pathways'] = vlen_list_of_str_parser
3994            parser['collapsed_ids'] = vlen_list_of_str_parser
3995            parser.update(parse_fs)
3996
3997            # fetch ID specific metadata
3998            md = [{} for i in range(len(ids))]
3999            for category, dset in viewitems(grp['metadata']):
4000                parse_f = parser[category]
4001                data = dset[:]
4002                for md_dict, data_row in zip(md, data):
4003                    md_dict[category] = parse_f(data_row)
4004
4005            # If there was no metadata on the axis, set it up as none
4006            md = md if any(md) else None
4007
4008            # Fetch the group metadata
4009            grp_md = {cat: val
4010                      for cat, val in grp['group-metadata'].items()}
4011            return ids, md, grp_md
4012
4013        obs_ids, obs_md, obs_grp_md = axis_load(h5grp['observation'])
4014        samp_ids, samp_md, samp_grp_md = axis_load(h5grp['sample'])
4015
4016        # load the data
4017        data_grp = h5grp[axis]['matrix']
4018        h5_data = data_grp["data"]
4019        h5_indices = data_grp["indices"]
4020        h5_indptr = data_grp["indptr"]
4021
4022        # Check if we need to subset the biom table
4023        if ids is not None:
4024            def _get_ids(source_ids, desired_ids):
4025                """If desired_ids is not None, makes sure that it is a subset
4026                of source_ids and returns the desired_ids array-like and a
4027                boolean array indicating where the desired_ids can be found in
4028                source_ids"""
4029                if desired_ids is None:
4030                    ids = source_ids[:]
4031                    idx = np.ones(source_ids.shape, dtype=bool)
4032                else:
4033                    desired_ids = np.asarray(desired_ids)
4034                    # Get the index of the source ids to include
4035                    idx = np.in1d(source_ids, desired_ids)
4036                    # Retrieve only the ids that we are interested on
4037                    ids = source_ids[idx]
4038                    # Check that all desired ids have been found on source ids
4039
4040                    if ids.shape != desired_ids.shape:
4041                        raise ValueError("The following ids could not be "
4042                                         "found in the biom table: %s" %
4043                                         (set(desired_ids) - set(ids)))
4044                return ids, idx
4045
4046            # Get the observation and sample ids that we are interested in
4047            samp, obs = (ids, None) if axis == 'sample' else (None, ids)
4048            obs_ids, obs_idx = _get_ids(obs_ids, obs)
4049            samp_ids, samp_idx = _get_ids(samp_ids, samp)
4050
4051            # Get the new matrix shape
4052            shape = (len(obs_ids), len(samp_ids))
4053
4054            # Fetch the metadata that we are interested in
4055            def _subset_metadata(md, idx):
4056                """If md has data, returns the subset indicated by idx, a
4057                boolean array"""
4058                if md:
4059                    md = list(np.asarray(md)[np.where(idx)])
4060                return md
4061
4062            obs_md = _subset_metadata(obs_md, obs_idx)
4063            samp_md = _subset_metadata(samp_md, samp_idx)
4064
4065            # load the subset of the data
4066            idx = samp_idx if axis == 'sample' else obs_idx
4067            keep = np.where(idx)[0]
4068            indptr_indices = sorted([(h5_indptr[i], h5_indptr[i+1])
4069                                     for i in keep])
4070            # Create the new indptr
4071            indptr_subset = np.array([end - start
4072                                      for start, end in indptr_indices])
4073            indptr = np.empty(len(keep) + 1, dtype=np.int32)
4074            indptr[0] = 0
4075            indptr[1:] = indptr_subset.cumsum()
4076
4077            data = np.hstack(h5_data[start:end]
4078                             for start, end in indptr_indices)
4079            indices = np.hstack(h5_indices[start:end]
4080                                for start, end in indptr_indices)
4081        else:
4082            # no subset need, just pass all data to scipy
4083            data = h5_data
4084            indices = h5_indices
4085            indptr = h5_indptr
4086
4087        cs = (data, indices, indptr)
4088
4089        if axis == 'sample':
4090            matrix = csc_matrix(cs, shape=shape)
4091        else:
4092            matrix = csr_matrix(cs, shape=shape)
4093
4094        t = Table(matrix, obs_ids, samp_ids, obs_md or None,
4095                  samp_md or None, type=type_, create_date=create_date,
4096                  generated_by=generated_by, table_id=id_,
4097                  observation_group_metadata=obs_grp_md,
4098                  sample_group_metadata=samp_grp_md)
4099
4100        if ids is not None:
4101            # filter out any empty samples or observations which may exist due
4102            # to subsetting
4103            def any_value(vals, id_, md):
4104                return np.any(vals)
4105
4106            axis = 'observation' if axis == 'sample' else 'sample'
4107            t.filter(any_value, axis=axis)
4108
4109        return t
4110
4111    def to_dataframe(self, dense=False):
4112        """Convert matrix data to a Pandas SparseDataFrame or DataFrame
4113
4114        Parameters
4115        ----------
4116        dense : bool, optional
4117            If True, return pd.DataFrame instead of pd.SparseDataFrame.
4118
4119        Returns
4120        -------
4121        pd.DataFrame or pd.SparseDataFrame
4122            A DataFrame indexed on the observation IDs, with the column
4123            names as the sample IDs.
4124
4125        Notes
4126        -----
4127        Metadata are not included.
4128
4129        Examples
4130        --------
4131        >>> from biom import example_table
4132        >>> df = example_table.to_dataframe()
4133        >>> df
4134             S1   S2   S3
4135        O1  0.0  1.0  2.0
4136        O2  3.0  4.0  5.0
4137        """
4138        index = self.ids(axis='observation')
4139        columns = self.ids()
4140
4141        if dense:
4142            mat = self.matrix_data.toarray()
4143            constructor = pd.DataFrame
4144        else:
4145            mat = self.matrix_data.copy()
4146            constructor = partial(pd.DataFrame.sparse.from_spmatrix)
4147
4148        return constructor(mat, index=index, columns=columns)
4149
4150    def to_anndata(self, dense=False, dtype="float32", transpose=True):
4151        """Convert Table to AnnData format
4152
4153        Parameters
4154        ----------
4155        dense : bool, optional
4156            If True, set adata.X as np.ndarray instead of sparse matrix.
4157        dtype: str, optional
4158            dtype used for storage in anndata object.
4159        tranpose: bool, optional
4160            If True, transpose the anndata so that observations are columns
4161
4162        Returns
4163        -------
4164        anndata.AnnData
4165            AnnData with matrix data and associated observation and
4166            sample metadata.
4167
4168        Notes
4169        -----
4170        Nested metadata are not included.
4171
4172        Examples
4173        --------
4174        >>> from biom import example_table
4175        >>> adata = example_table.to_anndata()
4176        >>> adata
4177        AnnData object with n_obs × n_vars = 3 × 2
4178            obs: 'environment'
4179            var: 'taxonomy_0', 'taxonomy_1'
4180        """
4181        try:
4182            import anndata
4183        except ImportError:
4184            raise ImportError(
4185                "Please install anndata package -- `pip install anndata`"
4186            )
4187        mat = self.matrix_data
4188
4189        if dense:
4190            mat = mat.toarray()
4191
4192        var = self.metadata_to_dataframe("sample")
4193        obs = self.metadata_to_dataframe("observation")
4194
4195        adata = anndata.AnnData(mat, obs=obs, var=var, dtype=dtype)
4196        # Convention for scRNA-seq analysis in Python
4197        adata = adata.transpose()
4198
4199        return adata
4200
4201    def metadata_to_dataframe(self, axis):
4202        """Convert axis metadata to a Pandas DataFrame
4203
4204        Parameters
4205        ----------
4206        axis : {'sample', 'observation'}
4207            The axis to operate on.
4208
4209        Returns
4210        -------
4211        pd.DataFrame
4212            A DataFrame indexed by the ids of the desired axis, columns by the
4213            metadata keys over that axis.
4214
4215        Raises
4216        ------
4217        UnknownAxisError
4218            If the requested axis isn't recognized
4219        KeyError
4220            IF the requested axis does not have metadata
4221        TypeError
4222            If a metadata column is a list or tuple, but is jagged over the
4223            axis.
4224
4225        Notes
4226        -----
4227        Nested metadata (e.g., KEGG_Pathways) is not supported.
4228
4229        Metadata which are lists or tuples (e.g., taxonomy) are expanded such
4230        that each index position is a unique column. For instance, the key
4231        taxonomy will become "taxonomy_0", "taxonomy_1", etc where "taxonomy_0"
4232        corresponds to the 0th index position of the taxonomy.
4233
4234        Examples
4235        --------
4236        >>> from biom import example_table
4237        >>> example_table.metadata_to_dataframe('observation')
4238           taxonomy_0     taxonomy_1
4239        O1   Bacteria     Firmicutes
4240        O2   Bacteria  Bacteroidetes
4241        """
4242        md = self.metadata(axis=axis)
4243        if md is None:
4244            raise KeyError("%s does not have metadata" % axis)
4245
4246        test = md[0]
4247        kv_test = sorted(list(test.items()))
4248        columns = []
4249        expand = {}
4250        for key, value in kv_test:
4251            if isinstance(value, (tuple, list)):
4252                expand[key] = True
4253                for idx in range(len(value)):
4254                    columns.append("%s_%d" % (key, idx))
4255            else:
4256                expand[key] = False
4257                columns.append(key)
4258
4259        rows = []
4260        for m in md:
4261            row = []
4262            for key, value in sorted(m.items()):
4263                if expand[key]:
4264                    for v in value:
4265                        row.append(v)
4266                else:
4267                    row.append(value)
4268            rows.append(row)
4269
4270        return pd.DataFrame(rows, index=self.ids(axis=axis), columns=columns)
4271
4272    def to_hdf5(self, h5grp, generated_by, compress=True, format_fs=None):
4273        """Store CSC and CSR in place
4274
4275        The resulting structure of this group is below. A few basic
4276        definitions, N is the number of observations and M is the number of
4277        samples. Data are stored in both compressed sparse row [1]_ (CSR, for
4278        observation oriented operations) and compressed sparse column [2]_
4279        (CSC, for sample oriented operations).
4280
4281        Notes
4282        -----
4283        The expected HDF5 group structure is below. An example of an HDF5 file
4284        in DDL can be found here [3]_.
4285
4286        - ./id                                                  : str, an \
4287arbitrary ID
4288        - ./type                                                : str, the \
4289table type (e.g, OTU table)
4290        - ./format-url                                          : str, a URL \
4291that describes the format
4292        - ./format-version                                      : two element \
4293tuple of int32, major and minor
4294        - ./generated-by                                        : str, what \
4295generated this file
4296        - ./creation-date                                       : str, ISO \
4297format
4298        - ./shape                                               : two element \
4299tuple of int32, N by M
4300        - ./nnz                                                 : int32 or \
4301int64, number of non zero elems
4302        - ./observation                                         : Group
4303        - ./observation/ids                                     : (N,) dataset\
4304 of str or vlen str
4305        - ./observation/matrix                                  : Group
4306        - ./observation/matrix/data                             : (nnz,) \
4307dataset of float64
4308        - ./observation/matrix/indices                          : (nnz,) \
4309dataset of int32
4310        - ./observation/matrix/indptr                           : (M+1,) \
4311dataset of int32
4312        - ./observation/metadata                                : Group
4313        - [./observation/metadata/foo]                          : Optional, \
4314(N,) dataset of any valid HDF5 type in index order with IDs.
4315        - ./observation/group-metadata                          : Group
4316        - [./observation/group-metadata/foo]                    : Optional, \
4317(?,) dataset of group metadata that relates IDs
4318        - [./observation/group-metadata/foo.attrs['data_type']] : attribute of\
4319 the foo dataset that describes contained type (e.g., newick)
4320        - ./sample                                              : Group
4321        - ./sample/ids                                          : (M,) dataset\
4322 of str or vlen str
4323        - ./sample/matrix                                       : Group
4324        - ./sample/matrix/data                                  : (nnz,) \
4325dataset of float64
4326        - ./sample/matrix/indices                               : (nnz,) \
4327dataset of int32
4328        - ./sample/matrix/indptr                                : (N+1,) \
4329dataset of int32
4330        - ./sample/metadata                                     : Group
4331        - [./sample/metadata/foo]                               : Optional, \
4332(M,) dataset of any valid HDF5 type in index order with IDs.
4333        - ./sample/group-metadata                               : Group
4334        - [./sample/group-metadata/foo]                         : Optional, \
4335(?,) dataset of group metadata that relates IDs
4336        - [./sample/group-metadata/foo.attrs['data_type']]      : attribute of\
4337 the foo dataset that describes contained type (e.g., newick)
4338
4339        The '?' character on the dataset size means that it can be of arbitrary
4340        length.
4341
4342        The expected structure for each of the metadata datasets is a list of
4343        atomic type objects (int, float, str, ...), where the index order of
4344        the list corresponds to the index order of the relevant axis IDs.
4345        Special metadata fields have been defined, and they are stored in a
4346        specific way. Currently, the available special metadata fields are:
4347
4348        - taxonomy: (N, ?) dataset of str or vlen str
4349        - KEGG_Pathways: (N, ?) dataset of str or vlen str
4350        - collapsed_ids: (N, ?) dataset of str or vlen str
4351
4352        Parameters
4353        ----------
4354        h5grp : `h5py.Group` or `h5py.File`
4355            The HDF5 entity in which to write the BIOM formatted data.
4356        generated_by : str
4357            A description of what generated the table
4358        compress : bool, optional
4359            Defaults to ``True`` means fields will be compressed with gzip,
4360            ``False`` means no compression
4361        format_fs : dict, optional
4362            Specify custom formatting functions for metadata fields. This dict
4363            is expected to be {'metadata_field': function}, where the function
4364            signature is (h5py.Group, str, dict, bool) corresponding to the
4365            specific HDF5 group the metadata dataset will be associated with,
4366            the category being operated on, the metadata for the entire axis
4367            being operated on, and whether to enable compression on the
4368            dataset.  Anything returned by this function is ignored.
4369
4370        Notes
4371        -----
4372        This method does not return anything and operates in place on h5grp.
4373
4374        See Also
4375        --------
4376        Table.from_hdf5
4377
4378        References
4379        ----------
4380        .. [1] http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/sci\
4381py.sparse.csr_matrix.html
4382        .. [2] http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/sci\
4383py.sparse.csc_matrix.html
4384        .. [3] http://biom-format.org/documentation/format_versions/biom-2.1.\
4385html
4386
4387        Examples
4388        --------
4389        >>> from biom.util import biom_open  # doctest: +SKIP
4390        >>> from biom.table import Table
4391        >>> from numpy import array
4392        >>> t = Table(array([[1, 2], [3, 4]]), ['a', 'b'], ['x', 'y'])
4393        >>> with biom_open('foo.biom', 'w') as f:  # doctest: +SKIP
4394        ...     t.to_hdf5(f, "example")
4395
4396        """
4397        if not HAVE_H5PY:
4398            raise RuntimeError("h5py is not in the environment, HDF5 support "
4399                               "is not available")
4400
4401        if format_fs is None:
4402            format_fs = {}
4403
4404        h5grp.attrs['id'] = self.table_id if self.table_id else "No Table ID"
4405        h5grp.attrs['type'] = self.type if self.type else ""
4406        h5grp.attrs['format-url'] = "http://biom-format.org"
4407        h5grp.attrs['format-version'] = self.format_version
4408        h5grp.attrs['generated-by'] = generated_by
4409        h5grp.attrs['creation-date'] = datetime.now().isoformat()
4410        h5grp.attrs['shape'] = self.shape
4411        h5grp.attrs['nnz'] = self.nnz
4412
4413        compression = None
4414        if compress is True:
4415            compression = 'gzip'
4416
4417        formatter = defaultdict(lambda: general_formatter)
4418        formatter['taxonomy'] = vlen_list_of_str_formatter
4419        formatter['KEGG_Pathways'] = vlen_list_of_str_formatter
4420        formatter['collapsed_ids'] = vlen_list_of_str_formatter
4421        formatter.update(format_fs)
4422
4423        for axis, order in zip(['observation', 'sample'], ['csr', 'csc']):
4424            grp = h5grp.create_group(axis)
4425
4426            self._data = self._data.asformat(order)
4427
4428            ids = self.ids(axis=axis)
4429            len_ids = len(ids)
4430            len_indptr = len(self._data.indptr)
4431            len_data = self.nnz
4432
4433            md = self.metadata(axis=axis)
4434
4435            # Create the group for the metadata
4436            grp.create_group('metadata')
4437            if md:
4438                exp = set(md[0])
4439                for other_id, other_md in zip(ids[1:], md[1:]):
4440                    if set(other_md) != exp:
4441                        raise ValueError("%s has inconsistent metadata "
4442                                         "categories with %s:\n"
4443                                         "%s: %s\n"
4444                                         "%s: %s" % (other_id, ids[0],
4445                                                     other_id, list(other_md),
4446                                                     ids[0], list(exp)))
4447
4448                for category in md[0]:
4449                    # Create the dataset for the current category,
4450                    # putting values in id order
4451                    formatter[category](grp, category, md, compression)
4452
4453            group_md = self.group_metadata(axis)
4454
4455            # Create the group for the group metadata
4456            grp.create_group('group-metadata')
4457
4458            if group_md:
4459                for key, value in group_md.items():
4460                    datatype, val = value
4461                    grp_dataset = grp.create_dataset(
4462                        'group-metadata/%s' % key,
4463                        shape=(1,), dtype=H5PY_VLEN_STR,
4464                        data=val, compression=compression)
4465                    grp_dataset.attrs['data_type'] = datatype
4466
4467            grp.create_group('matrix')
4468            grp.create_dataset('matrix/data', shape=(len_data,),
4469                               dtype=np.float64,
4470                               data=self._data.data,
4471                               compression=compression)
4472            grp.create_dataset('matrix/indices', shape=(len_data,),
4473                               dtype=np.int32,
4474                               data=self._data.indices,
4475                               compression=compression)
4476            grp.create_dataset('matrix/indptr', shape=(len_indptr,),
4477                               dtype=np.int32,
4478                               data=self._data.indptr,
4479                               compression=compression)
4480
4481            if len_ids > 0:
4482                # if we store IDs in the table as numpy arrays then this store
4483                # is cleaner, as is the parse
4484                grp.create_dataset('ids', shape=(len_ids,),
4485                                   dtype=H5PY_VLEN_STR,
4486                                   data=[i.encode('utf8') for i in ids],
4487                                   compression=compression)
4488            else:
4489                # Empty H5PY_VLEN_STR datasets are not supported.
4490                grp.create_dataset('ids', shape=(0, ), data=[],
4491                                   compression=compression)
4492
4493    @classmethod
4494    def from_json(self, json_table, data_pump=None,
4495                  input_is_dense=False):
4496        """Parse a biom otu table type
4497
4498        Parameters
4499        ----------
4500        json_table : dict
4501            A JSON object or dict that represents the BIOM table
4502        data_pump : tuple or None
4503            A secondary source of data
4504        input_is_dense : bool
4505            If `True`, the data contained will be interpretted as dense
4506
4507        Returns
4508        -------
4509        Table
4510
4511        Examples
4512        --------
4513        >>> from biom import Table
4514        >>> json_obj = {"id": "None",
4515        ...             "format": "Biological Observation Matrix 1.0.0",
4516        ...             "format_url": "http://biom-format.org",
4517        ...             "generated_by": "foo",
4518        ...             "type": "OTU table",
4519        ...             "date": "2014-06-03T14:24:40.884420",
4520        ...             "matrix_element_type": "float",
4521        ...             "shape": [5, 6],
4522        ...             "data": [[0,2,1.0],
4523        ...                      [1,0,5.0],
4524        ...                      [1,1,1.0],
4525        ...                      [1,3,2.0],
4526        ...                      [1,4,3.0],
4527        ...                      [1,5,1.0],
4528        ...                      [2,2,1.0],
4529        ...                      [2,3,4.0],
4530        ...                      [2,5,2.0],
4531        ...                      [3,0,2.0],
4532        ...                      [3,1,1.0],
4533        ...                      [3,2,1.0],
4534        ...                      [3,5,1.0],
4535        ...                      [4,1,1.0],
4536        ...                      [4,2,1.0]],
4537        ...             "rows": [{"id": "GG_OTU_1", "metadata": None},
4538        ...                      {"id": "GG_OTU_2", "metadata": None},
4539        ...                      {"id": "GG_OTU_3", "metadata": None},
4540        ...                      {"id": "GG_OTU_4", "metadata": None},
4541        ...                      {"id": "GG_OTU_5", "metadata": None}],
4542        ...             "columns": [{"id": "Sample1", "metadata": None},
4543        ...                         {"id": "Sample2", "metadata": None},
4544        ...                         {"id": "Sample3", "metadata": None},
4545        ...                         {"id": "Sample4", "metadata": None},
4546        ...                         {"id": "Sample5", "metadata": None},
4547        ...                         {"id": "Sample6", "metadata": None}]
4548        ...             }
4549        >>> t = Table.from_json(json_obj)
4550
4551        """
4552        sample_ids = [col['id'] for col in json_table['columns']]
4553        sample_metadata = [col['metadata'] for col in json_table['columns']]
4554        obs_ids = [row['id'] for row in json_table['rows']]
4555        obs_metadata = [row['metadata'] for row in json_table['rows']]
4556        dtype = MATRIX_ELEMENT_TYPE[json_table['matrix_element_type']]
4557        if 'matrix_type' in json_table:
4558            if json_table['matrix_type'] == 'dense':
4559                input_is_dense = True
4560            else:
4561                input_is_dense = False
4562        type_ = json_table['type']
4563
4564        if data_pump is None:
4565            table_obj = Table(json_table['data'], obs_ids, sample_ids,
4566                              obs_metadata, sample_metadata,
4567                              shape=json_table['shape'],
4568                              dtype=dtype,
4569                              type=type_,
4570                              generated_by=json_table['generated_by'],
4571                              input_is_dense=input_is_dense)
4572        else:
4573            table_obj = Table(data_pump, obs_ids, sample_ids,
4574                              obs_metadata, sample_metadata,
4575                              shape=json_table['shape'],
4576                              dtype=dtype,
4577                              type=type_,
4578                              generated_by=json_table['generated_by'],
4579                              input_is_dense=input_is_dense)
4580        return table_obj
4581
4582    def to_json(self, generated_by, direct_io=None):
4583        """Returns a JSON string representing the table in BIOM format.
4584
4585        Parameters
4586        ----------
4587        generated_by : str
4588            a string describing the software used to build the table
4589        direct_io : file or file-like object, optional
4590            Defaults to ``None``. Must implementing a ``write`` function. If
4591            `direct_io` is not ``None``, the final output is written directly
4592            to `direct_io` during processing.
4593
4594        Returns
4595        -------
4596        str
4597            A JSON-formatted string representing the biom table
4598        """
4599        if not isinstance(generated_by, string_types):
4600            raise TableException("Must specify a generated_by string")
4601
4602        # Fill in top-level metadata.
4603        if direct_io:
4604            direct_io.write(u'{')
4605            direct_io.write(u'"id": "%s",' % str(self.table_id))
4606            direct_io.write(
4607                u'"format": "%s",' %
4608                get_biom_format_version_string((1, 0)))  # JSON table -> 1.0.0
4609            direct_io.write(
4610                u'"format_url": "%s",' %
4611                get_biom_format_url_string())
4612            direct_io.write(u'"generated_by": "%s",' % generated_by)
4613            direct_io.write(u'"date": "%s",' % datetime.now().isoformat())
4614        else:
4615            id_ = u'"id": "%s",' % str(self.table_id)
4616            format_ = u'"format": "%s",' % get_biom_format_version_string(
4617                (1, 0))  # JSON table -> 1.0.0
4618            format_url = u'"format_url": "%s",' % get_biom_format_url_string()
4619            generated_by = u'"generated_by": "%s",' % generated_by
4620            date = u'"date": "%s",' % datetime.now().isoformat()
4621
4622        # Determine if we have any data in the matrix, and what the shape of
4623        # the matrix is.
4624        try:
4625            num_rows, num_cols = self.shape
4626        except:  # noqa
4627            num_rows = num_cols = 0
4628        has_data = True if num_rows > 0 and num_cols > 0 else False
4629
4630        # Default the matrix element type to test to be an integer in case we
4631        # don't have any data in the matrix to test.
4632        test_element = 0
4633        if has_data:
4634            test_element = self[0, 0]
4635
4636        # Determine the type of elements the matrix is storing.
4637        if isinstance(test_element, int):
4638            matrix_element_type = u"int"
4639        elif isinstance(test_element, float):
4640            matrix_element_type = u"float"
4641        elif isinstance(test_element, string_types):
4642            matrix_element_type = u"str"
4643        else:
4644            raise TableException("Unsupported matrix data type.")
4645
4646        # Fill in details about the matrix.
4647        if direct_io:
4648            direct_io.write(
4649                u'"matrix_element_type": "%s",' %
4650                matrix_element_type)
4651            direct_io.write(u'"shape": [%d, %d],' % (num_rows, num_cols))
4652        else:
4653            matrix_element_type = u'"matrix_element_type": "%s",' % \
4654                matrix_element_type
4655            shape = u'"shape": [%d, %d],' % (num_rows, num_cols)
4656
4657        # Fill in the table type
4658        if self.type is None:
4659            type_ = u'"type": null,'
4660        else:
4661            type_ = u'"type": "%s",' % self.type
4662
4663        if direct_io:
4664            direct_io.write(type_)
4665
4666        # Fill in details about the rows in the table and fill in the matrix's
4667        # data. BIOM 2.0+ is now only sparse
4668        if direct_io:
4669            direct_io.write(u'"matrix_type": "sparse",')
4670            direct_io.write(u'"data": [')
4671        else:
4672            matrix_type = u'"matrix_type": "sparse",'
4673            data = [u'"data": [']
4674
4675        max_row_idx = len(self.ids(axis='observation')) - 1
4676        max_col_idx = len(self.ids()) - 1
4677        rows = [u'"rows": [']
4678        have_written = False
4679        for obs_index, obs in enumerate(self.iter(axis='observation')):
4680            # i'm crying on the inside
4681            if obs_index != max_row_idx:
4682                rows.append(u'{"id": %s, "metadata": %s},' % (dumps(obs[1]),
4683                                                              dumps(obs[2])))
4684            else:
4685                rows.append(u'{"id": %s, "metadata": %s}],' % (dumps(obs[1]),
4686                                                               dumps(obs[2])))
4687
4688            # turns out its a pain to figure out when to place commas. the
4689            # simple work around, at the expense of a little memory
4690            # (bound by the number of samples) is to build of what will be
4691            # written, and then add in the commas where necessary.
4692            built_row = []
4693            for col_index, val in enumerate(obs[0]):
4694                if float(val) != 0.0:
4695                    built_row.append(u"[%d,%d,%r]" % (obs_index, col_index,
4696                                                      val))
4697            if built_row:
4698                # if we have written a row already, its safe to add a comma
4699                if have_written:
4700                    if direct_io:
4701                        direct_io.write(u',')
4702                    else:
4703                        data.append(u',')
4704                if direct_io:
4705                    direct_io.write(u','.join(built_row))
4706                else:
4707                    data.append(u','.join(built_row))
4708
4709                have_written = True
4710
4711        # finalize the data block
4712        if direct_io:
4713            direct_io.write(u"],")
4714        else:
4715            data.append(u"],")
4716
4717        # Fill in details about the columns in the table.
4718        columns = [u'"columns": [']
4719        for samp_index, samp in enumerate(self.iter()):
4720            if samp_index != max_col_idx:
4721                columns.append(u'{"id": %s, "metadata": %s},' % (
4722                    dumps(samp[1]), dumps(samp[2])))
4723            else:
4724                columns.append(u'{"id": %s, "metadata": %s}]' % (
4725                    dumps(samp[1]), dumps(samp[2])))
4726
4727        if rows[0] == u'"rows": [' and len(rows) == 1:
4728            # empty table case
4729            rows = [u'"rows": [],']
4730            columns = [u'"columns": []']
4731
4732        rows = u''.join(rows)
4733        columns = u''.join(columns)
4734
4735        if direct_io:
4736            direct_io.write(rows)
4737            direct_io.write(columns)
4738            direct_io.write(u'}')
4739        else:
4740            return u"{%s}" % ''.join([id_, format_, format_url, matrix_type,
4741                                      generated_by, date, type_,
4742                                      matrix_element_type, shape,
4743                                      u''.join(data), rows, columns])
4744
4745    @staticmethod
4746    def from_adjacency(lines):
4747        """Parse an adjacency format into BIOM
4748
4749        Parameters
4750        ----------
4751        lines : list, str, or file-like object
4752            The tab delimited data to parse
4753
4754        Returns
4755        -------
4756        biom.Table
4757            A BIOM ``Table`` object
4758
4759        Notes
4760        -----
4761        The input is expected to be of the form: observation, sample, value. A
4762        header is not required, but if present, it must be of the form:
4763
4764        #OTU ID<tab>SampleID<tab>value
4765
4766        Raises
4767        ------
4768        ValueError
4769            If the input is not an iterable or file-like object.
4770        ValueError
4771            If the data is incorrectly formatted.
4772
4773        Examples
4774        --------
4775        Parse tab separated adjacency data into a table:
4776
4777        >>> from biom.table import Table
4778        >>> from io import StringIO
4779        >>> data = 'a\\tb\\t1\\na\\tc\\t2\\nd\\tc\\t3'
4780        >>> data_fh = StringIO(data)
4781        >>> test_table = Table.from_adjacency(data_fh)
4782        """
4783        if not isinstance(lines, (list, tuple)):
4784            if hasattr(lines, 'readlines'):
4785                lines = lines.readlines()
4786            elif hasattr(lines, 'splitlines'):
4787                lines = lines.splitlines()
4788            else:
4789                raise ValueError("Not sure how to handle this input")
4790
4791        def is_num(item):
4792            # from https://stackoverflow.com/a/23059703
4793            numeric = re.compile(r'(?=.)([+-]?([0-9]*)(\.([0-9]+))?)([eE][+-]?\d+)?')  # noqa
4794            match = numeric.match(item)
4795            start, stop = match.span()
4796            if (stop - start) == len(item):
4797                return True
4798            else:
4799                return False
4800
4801        # sanity check and determine if we have a header or not
4802        lh = lines[0].strip().split('\t')
4803        if len(lh) != 3:
4804            raise ValueError("Does not appear to be an adjacency format")
4805        elif lh == ['#OTU ID', 'SampleID', 'value']:
4806            include_line_zero = False
4807        elif is_num(lh[2]):
4808            # allow anything for columns 1 and 2, but test that column 3 is
4809            # numeric
4810            include_line_zero = True
4811        else:
4812            raise ValueError("Does not appear to be an adjacency format")
4813
4814        if not include_line_zero:
4815            lines = lines[1:]
4816
4817        # extract the entities
4818        observations = []
4819        samples = []
4820        values = []
4821        for line in lines:
4822            parts = line.split('\t')
4823            assert len(parts) == 3
4824            observations.append(parts[0])
4825            samples.append(parts[1])
4826            values.append(float(parts[2]))
4827
4828        # determine a stable order and index positioning for the identifiers
4829        obs_order = sorted(set(observations))
4830        samp_order = sorted(set(samples))
4831        obs_index = {o: i for i, o in enumerate(obs_order)}
4832        samp_index = {s: i for i, s in enumerate(samp_order)}
4833
4834        # fill the matrix
4835        row = np.array([obs_index[obs] for obs in observations], dtype=int)
4836        col = np.array([samp_index[samp] for samp in samples], dtype=int)
4837        data = np.asarray(values)
4838        mat = coo_matrix((data, (row, col)))
4839
4840        return Table(mat, obs_order, samp_order)
4841
4842    @staticmethod
4843    def from_tsv(lines, obs_mapping, sample_mapping,
4844                 process_func, **kwargs):
4845        """Parse a tab separated (observation x sample) formatted BIOM table
4846
4847        Parameters
4848        ----------
4849        lines : list, or file-like object
4850            The tab delimited data to parse
4851        obs_mapping : dict or None
4852            The corresponding observation metadata
4853        sample_mapping : dict or None
4854            The corresponding sample metadata
4855        process_func : function
4856            A function to transform the observation metadata
4857
4858        Returns
4859        -------
4860        biom.Table
4861            A BIOM ``Table`` object
4862
4863        Examples
4864        --------
4865        Parse tab separated data into a table:
4866
4867        >>> from biom.table import Table
4868        >>> from io import StringIO
4869        >>> tsv = 'a\\tb\\tc\\n1\\t2\\t3\\n4\\t5\\t6'
4870        >>> tsv_fh = StringIO(tsv)
4871        >>> func = lambda x : x
4872        >>> test_table = Table.from_tsv(tsv_fh, None, None, func)
4873        """
4874        (sample_ids, obs_ids, data, t_md,
4875         t_md_name) = Table._extract_data_from_tsv(lines, **kwargs)
4876
4877        # if we have it, keep it
4878        if t_md is None:
4879            obs_metadata = None
4880        else:
4881            obs_metadata = [{t_md_name: process_func(v)} for v in t_md]
4882
4883        if sample_mapping is None:
4884            sample_metadata = None
4885        else:
4886            sample_metadata = [sample_mapping[sample_id]
4887                               for sample_id in sample_ids]
4888
4889        # will override any metadata from parsed table
4890        if obs_mapping is not None:
4891            obs_metadata = [obs_mapping[obs_id] for obs_id in obs_ids]
4892
4893        return Table(data, obs_ids, sample_ids, obs_metadata, sample_metadata)
4894
4895    @staticmethod
4896    def _extract_data_from_tsv(lines, delim='\t', dtype=float, md_parse=None):
4897        """Parse a classic table into (sample_ids, obs_ids, data, metadata,
4898        name)
4899
4900        Parameters
4901        ----------
4902        lines: list or file-like object
4903            delimted data to parse
4904        delim: string
4905            delimeter in file lines
4906        dtype: type
4907        md_parse:  function or None
4908            funtion used to parse metdata
4909
4910        Returns
4911        -------
4912        list
4913            sample_ids
4914        list
4915            observation_ids
4916        array
4917            data
4918        list
4919            metadata
4920        string
4921            column name if last column is non-numeric
4922
4923        Notes
4924        ------
4925        This is intended to be close to how QIIME classic OTU tables are parsed
4926        with the exception of the additional md_name field
4927
4928        This function is ported from QIIME (http://www.qiime.org), previously
4929        named parse_classic_otu_table. QIIME is a GPL project, but we obtained
4930        permission from the authors of this function to port it to the BIOM
4931        Format project (and keep it under BIOM's BSD license).
4932
4933        .. shownumpydoc
4934        """
4935        def isfloat(value):
4936            # see https://stackoverflow.com/a/20929881
4937            try:
4938                float(value)
4939                return True
4940            except ValueError:
4941                return False
4942
4943        if not isinstance(lines, list):
4944            try:
4945                hasattr(lines, 'seek')
4946            except AttributeError:
4947                raise RuntimeError(
4948                    "Input needs to support seek or be indexable")
4949
4950        # find header, the first line that is not empty and does not start
4951        # with a #
4952        header = False
4953        list_index = 0
4954        for line in lines:
4955            if not line.strip():
4956                continue
4957            if not line.startswith('#'):
4958                # Covers the case where the first line is the header
4959                # and there is no indication of it (no comment character)
4960                if not header:
4961                    header = line.rstrip().split(delim)[1:]
4962                    data_start = list_index + 1
4963                else:
4964                    data_start = list_index
4965                break
4966            list_index += 1
4967            header = line.strip().split(delim)[1:]
4968
4969        # If the first line is the header, then we need to get the data lines
4970        # line for the "last column" check
4971        if isinstance(lines, list):
4972            value_checks = lines[data_start:]
4973        else:
4974            lines.seek(0)
4975            for index in range(0, data_start):
4976                lines.readline()
4977            value_checks = [line for line in lines]
4978
4979        # attempt to determine if the last column is non-numeric, ie, metadata
4980        last_values = [line.rsplit(delim, 1)[-1].strip()
4981                       for line in value_checks]
4982        last_column_is_numeric = all([isfloat(i) for i in last_values])
4983
4984        # determine sample ids
4985        if last_column_is_numeric:
4986            md_name = None
4987            metadata = None
4988            samp_ids = header[:]
4989        else:
4990            md_name = header[-1]
4991            metadata = []
4992            samp_ids = header[:-1]
4993
4994        data = []
4995        obs_ids = []
4996        row_number = 0
4997
4998        # Go back to the beginning if it is a file:
4999        if hasattr(lines, 'seek'):
5000            lines.seek(0)
5001            for index in range(0, data_start):
5002                line = lines.readline()
5003        else:
5004            lines = lines[data_start:]
5005
5006        for lineno, line in enumerate(lines, data_start):
5007            if not line.strip():
5008                continue
5009            if line.startswith('#'):
5010                continue
5011
5012            fields = line.split(delim)
5013            fields[-1] = fields[-1].strip()
5014            obs_ids.append(fields[0])
5015
5016            if last_column_is_numeric:
5017                try:
5018                    values = list(map(dtype, fields[1:]))
5019                except ValueError:
5020                    badval, badidx = _identify_bad_value(dtype, fields[1:])
5021                    msg = "Invalid value on line %d, column %d, value %s"
5022                    raise TypeError(msg % (lineno, badidx+1, badval))
5023            else:
5024                try:
5025                    values = list(map(dtype, fields[1:-1]))
5026                except ValueError:
5027                    badval, badidx = _identify_bad_value(dtype, fields[1:])
5028                    msg = "Invalid value on line %d, column %d, value %s"
5029                    raise TypeError(msg % (lineno, badidx+1, badval))
5030
5031                if md_parse is not None:
5032                    metadata.append(md_parse(fields[-1]))
5033                else:
5034                    metadata.append(fields[-1])
5035            for column_number in range(0, len(values)):
5036                if values[column_number] != dtype(0):
5037                    data.append([row_number, column_number,
5038                                 values[column_number]])
5039            row_number += 1
5040        return samp_ids, obs_ids, data, metadata, md_name
5041
5042    def to_tsv(self, header_key=None, header_value=None,
5043               metadata_formatter=str,
5044               observation_column_name='#OTU ID',
5045               direct_io=None):
5046        """Return self as a string in tab delimited form
5047
5048        Default ``str`` output for the ``Table`` is just row/col ids and table
5049        data without any metadata
5050
5051        Parameters
5052        ----------
5053        header_key : str or ``None``, optional
5054            Defaults to ``None``
5055        header_value : str or ``None``, optional
5056            Defaults to ``None``
5057        metadata_formatter : function, optional
5058            Defaults to ``str``.  a function which takes a metadata entry and
5059            returns a formatted version that should be written to file
5060        observation_column_name : str, optional
5061            Defaults to "#OTU ID". The name of the first column in the output
5062            table, corresponding to the observation IDs.
5063        direct_io : file or file-like object, optional
5064            Defaults to ``None``. Must implement a ``write`` function. If
5065            `direct_io` is not ``None``, the final output is written directly
5066            to `direct_io` during processing.
5067
5068        Returns
5069        -------
5070        str
5071            tab delimited representation of the Table
5072
5073        Examples
5074        --------
5075
5076        >>> import numpy as np
5077        >>> from biom.table import Table
5078
5079        Create a 2x3 BIOM table, with observation metadata and no sample
5080        metadata:
5081
5082        >>> data = np.asarray([[0, 0, 1], [1, 3, 42]])
5083        >>> table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'],
5084        ...               [{'foo': 'bar'}, {'x': 'y'}], None)
5085        >>> print(table.to_tsv()) # doctest: +NORMALIZE_WHITESPACE
5086        # Constructed from biom file
5087        #OTU ID	S1	S2	S3
5088        O1	0.0	0.0	1.0
5089        O2	1.0	3.0	42.0
5090        >>> with open("result.tsv", "w") as f:
5091                table.to_tsv(direct_io=f)
5092        """
5093        return self.delimited_self(u'\t', header_key, header_value,
5094                                   metadata_formatter,
5095                                   observation_column_name,
5096                                   direct_io=direct_io)
5097
5098
5099def coo_arrays_to_sparse(data, dtype=np.float64, shape=None):
5100    """Map directly on to the coo_matrix constructor
5101
5102    Parameters
5103    ----------
5104    data : tuple
5105        data must be (values, (rows, cols))
5106    dtype : type, optional
5107        Defaults to ``np.float64``
5108    shape : tuple or ``None``, optional
5109        Defaults to ``None``. If `shape` is ``None``, shape will be determined
5110        automatically from `data`.
5111    """
5112    if shape is None:
5113        values, (rows, cols) = data
5114        n_rows = max(rows) + 1
5115        n_cols = max(cols) + 1
5116    else:
5117        n_rows, n_cols = shape
5118
5119    # coo_matrix allows zeros to be added as data, and this affects
5120    # nnz, items, and iteritems. Clean them out here, as this is
5121    # the only time these zeros can creep in.
5122    # Note: coo_matrix allows duplicate entries; the entries will
5123    # be summed when converted. Not really sure how we want to
5124    # handle this generally within BIOM- I'm okay with leaving it
5125    # as undefined behavior for now.
5126    matrix = coo_matrix(data, shape=(n_rows, n_cols), dtype=dtype)
5127    matrix = matrix.tocsr()
5128    matrix.eliminate_zeros()
5129    return matrix
5130
5131
5132def list_list_to_sparse(data, dtype=float, shape=None):
5133    """Convert a list of lists into a scipy.sparse matrix.
5134
5135    Parameters
5136    ----------
5137    data : iterable of iterables
5138        `data` should be in the format [[row, col, value], ...]
5139    dtype : type, optional
5140        defaults to ``float``
5141    shape : tuple or ``None``, optional
5142        Defaults to ``None``. If `shape` is ``None``, shape will be determined
5143        automatically from `data`.
5144
5145    Returns
5146    -------
5147    scipy.csr_matrix
5148        The newly generated matrix
5149    """
5150    rows, cols, values = zip(*data)
5151
5152    if shape is None:
5153        n_rows = max(rows) + 1
5154        n_cols = max(cols) + 1
5155    else:
5156        n_rows, n_cols = shape
5157
5158    matrix = coo_matrix((values, (rows, cols)), shape=(n_rows, n_cols),
5159                        dtype=dtype)
5160    matrix = matrix.tocsr()
5161    matrix.eliminate_zeros()
5162    return matrix
5163
5164
5165def nparray_to_sparse(data, dtype=float):
5166    """Convert a numpy array to a scipy.sparse matrix.
5167
5168    Parameters
5169    ----------
5170    data : numpy.array
5171        The data to convert into a sparse matrix
5172    dtype : type, optional
5173        Defaults to ``float``. The type of data to be represented.
5174
5175    Returns
5176    -------
5177    scipy.csr_matrix
5178        The newly generated matrix
5179    """
5180    if data.shape == (0,):
5181        # an empty vector. Note, this short circuit is necessary as calling
5182        # csr_matrix([], shape=(0, 0), dtype=dtype) will result in a matrix
5183        # has a shape of (1, 0).
5184        return csr_matrix((0, 0), dtype=dtype)
5185    elif data.shape in ((1, 0), (0, 1)) and data.size == 0:
5186        # an empty matrix. This short circuit is necessary for the same reason
5187        # as the empty vector. While a (1, 0) matrix is _empty_, this does
5188        # confound code that assumes that (1, 0) means there might be metadata
5189        # or IDs associated with that singular row
5190        return csr_matrix((0, 0), dtype=dtype)
5191    elif len(data.shape) == 1:
5192        # a vector
5193        shape = (1, data.shape[0])
5194    else:
5195        shape = data.shape
5196
5197    matrix = coo_matrix(data, shape=shape, dtype=dtype)
5198    matrix = matrix.tocsr()
5199    matrix.eliminate_zeros()
5200    return matrix
5201
5202
5203def list_nparray_to_sparse(data, dtype=float):
5204    """Takes a list of numpy arrays and creates a scipy.sparse matrix.
5205
5206    Parameters
5207    ----------
5208    data : iterable of numpy.array
5209        The data to convert into a sparse matrix
5210    dtype : type, optional
5211        Defaults to ``float``. The type of data to be represented.
5212
5213    Returns
5214    -------
5215    scipy.csr_matrix
5216        The newly generated matrix
5217    """
5218    matrix = coo_matrix(data, shape=(len(data), len(data[0])), dtype=dtype)
5219    matrix = matrix.tocsr()
5220    matrix.eliminate_zeros()
5221    return matrix
5222
5223
5224def list_sparse_to_sparse(data, dtype=float):
5225    """Takes a list of scipy.sparse matrices and creates a scipy.sparse mat.
5226
5227    Parameters
5228    ----------
5229    data : iterable of scipy.sparse matrices
5230        The data to convert into a sparse matrix
5231    dtype : type, optional
5232        Defaults to ``float``. The type of data to be represented.
5233
5234    Returns
5235    -------
5236    scipy.csr_matrix
5237        The newly generated matrix
5238    """
5239    if isspmatrix(data[0]):
5240        if data[0].shape[0] > data[0].shape[1]:
5241            n_cols = len(data)
5242            n_rows = data[0].shape[0]
5243        else:
5244            n_rows = len(data)
5245            n_cols = data[0].shape[1]
5246    else:
5247        all_keys = flatten([d.keys() for d in data])
5248        n_rows = max(all_keys, key=itemgetter(0))[0] + 1
5249        n_cols = max(all_keys, key=itemgetter(1))[1] + 1
5250        if n_rows > n_cols:
5251            n_cols = len(data)
5252        else:
5253            n_rows = len(data)
5254
5255    data = vstack(data)
5256    matrix = coo_matrix(data, shape=(n_rows, n_cols),
5257                        dtype=dtype)
5258    matrix = matrix.tocsr()
5259    matrix.eliminate_zeros()
5260    return matrix
5261
5262
5263def list_dict_to_sparse(data, dtype=float):
5264    """Takes a list of dict {(row,col):val} and creates a scipy.sparse mat.
5265
5266    Parameters
5267    ----------
5268    data : iterable of dicts
5269        The data to convert into a sparse matrix
5270    dtype : type, optional
5271        Defaults to ``float``. The type of data to be represented.
5272
5273    Returns
5274    -------
5275    scipy.csr_matrix
5276        The newly generated matrix
5277    """
5278    if isspmatrix(data[0]):
5279        if data[0].shape[0] > data[0].shape[1]:
5280            is_col = True
5281            n_cols = len(data)
5282            n_rows = data[0].shape[0]
5283        else:
5284            is_col = False
5285            n_rows = len(data)
5286            n_cols = data[0].shape[1]
5287    else:
5288        all_keys = flatten([d.keys() for d in data])
5289        n_rows = max(all_keys, key=itemgetter(0))[0] + 1
5290        n_cols = max(all_keys, key=itemgetter(1))[1] + 1
5291        if n_rows > n_cols:
5292            is_col = True
5293            n_cols = len(data)
5294        else:
5295            is_col = False
5296            n_rows = len(data)
5297
5298    rows = []
5299    cols = []
5300    vals = []
5301    for row_idx, row in enumerate(data):
5302        for (row_val, col_idx), val in row.items():
5303            if is_col:
5304                # transpose
5305                rows.append(row_val)
5306                cols.append(row_idx)
5307                vals.append(val)
5308            else:
5309                rows.append(row_idx)
5310                cols.append(col_idx)
5311                vals.append(val)
5312
5313    matrix = coo_matrix((vals, (rows, cols)), shape=(n_rows, n_cols),
5314                        dtype=dtype)
5315    matrix = matrix.tocsr()
5316    matrix.eliminate_zeros()
5317    return matrix
5318
5319
5320def dict_to_sparse(data, dtype=float, shape=None):
5321    """Takes a dict {(row,col):val} and creates a scipy.sparse matrix.
5322
5323    Parameters
5324    ----------
5325    data : dict
5326        The data to convert into a sparse matrix
5327    dtype : type, optional
5328        Defaults to ``float``. The type of data to be represented.
5329
5330    Returns
5331    -------
5332    scipy.csr_matrix
5333        The newly generated matrix
5334    """
5335    if shape is None:
5336        n_rows = max(data.keys(), key=itemgetter(0))[0] + 1
5337        n_cols = max(data.keys(), key=itemgetter(1))[1] + 1
5338    else:
5339        n_rows, n_cols = shape
5340
5341    rows = []
5342    cols = []
5343    vals = []
5344    for (r, c), v in viewitems(data):
5345        rows.append(r)
5346        cols.append(c)
5347        vals.append(v)
5348
5349    return coo_arrays_to_sparse((vals, (rows, cols)),
5350                                shape=(n_rows, n_cols), dtype=dtype)
5351