1import numpy as np
2import pandas as pd
3from scipy.sparse import issparse
4from pointannotator.annotate_samples import (
5    PFUN_BINOMIAL,
6    SCORING_LOG_FDR,
7    SCORING_EXP_RATIO,
8    PFUN_HYPERGEOMETRIC,
9    SCORING_MARKERS_SUM,
10    AnnotateSamples,
11)
12
13from Orange.data import Table, Domain, ContinuousVariable
14
15from orangecontrib.bioinformatics.ncbi.gene import GeneInfo
16from orangecontrib.bioinformatics.widgets.utils.data import TAX_ID
17
18__all__ = [
19    "PFUN_BINOMIAL",
20    "PFUN_HYPERGEOMETRIC",
21    "SCORING_EXP_RATIO",
22    "SCORING_LOG_FDR",
23    "SCORING_MARKERS_SUM",
24    "AnnotateSamplesMeta",
25]
26
27
28class ScoringNotImplemented(Exception):
29    pass
30
31
32class AnnotateSamplesMeta:
33    """
34    AnnotateSamples is a meta class used for the annotation of data items with
35    the labels Mann-Whitney U test for selecting important values and
36    the Hyper-geometric for assigning the labels. This class is a proxy to the
37    external library - point-annotator.
38
39    Example for full annotation:
40
41    >>> from Orange.data import Table
42    >>> from orangecontrib.bioinformatics.utils import serverfiles
43    >>> from orangecontrib.bioinformatics.annotation.annotate_samples import \
44    ...     AnnotateSamples
45    >>> from orangecontrib.bioinformatics.widgets.utils.data import TAX_ID
46    >>> from Orange.data.filter import FilterString, Values
47    >>>
48    >>> data = Table("https://datasets.orange.biolab.si/sc/aml-1k.tab.gz")
49    >>> data.attributes[TAX_ID] = "9606"  # table needs to have an organism ID
50    >>> markers_path = serverfiles.localpath_download(
51    ...     'marker_genes','panglao_gene_markers.tab')
52    >>> marker = Table(markers_path)
53    >>>
54    >>> # filter only human markers
55    >>> f = FilterString("Organism", FilterString.Equal, "Human")
56    >>> markers = Values([f])(marker)
57    >>>
58    >>> annotator = AnnotateSamples
59    >>> z = annotator.mann_whitney_test(data)
60    >>> scores, p_val = AnnotateSamples.assign_annotations(z, markers, data)
61    >>> scores = annotator.filter_annotations(scores, p_val, p_threshold=0.05)
62    """
63
64    @staticmethod
65    def _to_pandas(data, use_entrez_id=False):
66        """
67        Transform Orange table to pandas dataframe.
68
69        Parameters
70        ----------
71        data : Orange.data.Table
72            Orange table with gene data. We suppose that genes have `Entrez ID`
73            assigned.
74        use_entrez_id : bool
75            This parameter tells whether to use Entrez ID as column names.
76            When it is False variables names will be used
77
78        Returns
79        -------
80        pd.DataFrame
81            Pandas DataFrame with EntrezID
82        list
83            List of booleans with True on location of included columns in
84            dataframe.
85        """
86        if use_entrez_id:
87            entrez_ids = [x.attributes.get("Entrez ID") for x in data.domain.attributes]
88            has_entrez_id = np.array([x is not None for x in entrez_ids])
89            columns = list(map(str, np.array(entrez_ids)))
90            columns_subset = has_entrez_id.tolist()
91        else:
92            columns = list(map(str, data.domain.attributes))
93            columns_subset = None
94
95        if issparse(data.X):
96            df = pd.DataFrame.sparse.from_spmatrix(data.X, columns=columns)
97        else:
98            df = pd.DataFrame(data.X, columns=columns)
99        return (df if columns_subset is None else df.loc[:, columns_subset], columns_subset)
100
101    @staticmethod
102    def mann_whitney_test(data):
103        """
104        Compute z values with Mann-Whitney U test.
105
106        Parameters
107        ----------
108        data : Orange.data.Table
109            Tabular data with gene expressions
110
111        Returns
112        -------
113        Orange.data.Table
114            Z-value for each item.
115        """
116        df, columns = AnnotateSamplesMeta._to_pandas(data, use_entrez_id=False)
117        df_z_values = AnnotateSamples.mann_whitney_test(df)
118
119        z_table = Table(data.domain, df_z_values.values, data.Y, metas=data.metas)
120
121        return z_table
122
123    @staticmethod
124    def _prepare_annotations(ann):
125        """
126        This function prepares annotation to be acceptable by the method.
127        """
128        # transform to string if discrete else keep string
129        ct_values = (
130            list(map(ann.domain["Cell Type"].repr_val, ann.get_column_view("Cell Type")[0]))
131            if ann.domain["Cell Type"].is_discrete
132            else ann.get_column_view("Cell Type")[0]
133        )
134        entrez_values = (
135            list(map(ann.domain["Entrez ID"].repr_val, ann.get_column_view("Entrez ID")[0]))
136            if ann.domain["Entrez ID"].is_discrete
137            else ann.get_column_view("Entrez ID")[0]
138        )
139
140        # the framework recognizes Gene instead of Entrez ID
141        df_available_annotations = pd.DataFrame(list(zip(ct_values, entrez_values)), columns=["Cell Type", "Gene"])
142        return df_available_annotations
143
144    @staticmethod
145    def assign_annotations(
146        z_values, available_annotations, data, z_threshold=1, p_value_fun=PFUN_BINOMIAL, scoring=SCORING_EXP_RATIO
147    ):
148        """
149        The function gets a set of attributes (e.g. genes) for each cell and
150        attributes for each annotation. It returns the annotations significant
151        for each cell.
152
153        Parameters
154        ----------
155        z_values : Orange.data.Table
156            Table which show z values for each item
157        available_annotations : Orange.data.Table
158            Available annotations (e.g. cell types)
159        z_threshold : float
160            The threshold for selecting the attribute. For each item the
161            attributes with z-value above this value are selected.
162        p_value_fun : str, optional (defaults: TEST_BINOMIAL)
163            A function that calculates p-value. It can be either
164            PFUN_BINOMIAL that uses statistics.Binomial().p_value or
165            PFUN_HYPERGEOMETRIC that uses hypergeom.sf.
166        data : Orange.data.Table
167            Tabular data with gene expressions - we need that to compute scores.
168        scoring : str, optional (default=SCORING_EXP_RATIO)
169            Type of scoring
170
171        Returns
172        -------
173        Orange.data.Table
174            Annotation probabilities
175        Orange.data.Table
176            Annotation fdrs
177        """
178        # checks that assures that data are ok
179        assert TAX_ID in data.attributes, "The input table needs to have a " "tax_id attribute"
180        assert any(
181            "Entrez ID" in x.attributes for x in data.domain.attributes
182        ), "Input data do not contain gene expression data."
183
184        # retrieve number of genes
185        tax_id = data.attributes[TAX_ID]
186        n = len(GeneInfo(tax_id))  # number of genes for organism
187
188        # transform data to pandas dataframe
189        df_z_values, _ = AnnotateSamplesMeta._to_pandas(z_values, use_entrez_id=True)
190        df_data, _ = AnnotateSamplesMeta._to_pandas(data, use_entrez_id=True)
191        df_available_annotations = AnnotateSamplesMeta._prepare_annotations(available_annotations)
192        df_available_annotations = df_available_annotations[df_available_annotations["Gene"] != "?"]
193
194        # call the method
195        scores, fdrs = AnnotateSamples.assign_annotations(
196            df_z_values,
197            df_available_annotations,
198            df_data,
199            num_all_attributes=n,
200            attributes_col="Gene",
201            annotations_col="Cell Type",
202            z_threshold=z_threshold,
203            p_value_fun=p_value_fun,
204            scoring=scoring,
205        )
206        # create orange tables
207        domain = Domain([ContinuousVariable(ct) for ct in scores.columns.values])
208        scores_table = Table(domain, scores.values)
209        fdrs_table = Table(domain, fdrs.values)
210
211        return scores_table, fdrs_table
212
213    @staticmethod
214    def filter_annotations(scores, p_values, return_nonzero_annotations=True, p_threshold=0.05):
215        """
216        This function filters the probabilities on places that do not reach the
217        threshold for p-value and filter zero columns
218        return_nonzero_annotations is True.
219
220        Parameters
221        ----------
222        scores : Orange.data.Table
223            Scores for each annotations for each cell
224        p_values : Orange.data.Table
225            p-value scores for annotations for each cell
226        return_nonzero_annotations : bool
227            Flag that enables filtering the non-zero columns.
228        p_threshold : float
229            A threshold for accepting the annotations. Annotations that has FDR
230            value bellow this threshold are used.
231
232        Returns
233        -------
234        Orange.data.Table
235            Filtered scores for each annotations for each cell
236        """
237        df_scores, _ = AnnotateSamplesMeta._to_pandas(scores)
238        df_p_values, _ = AnnotateSamplesMeta._to_pandas(p_values)
239        scores = AnnotateSamples.filter_annotations(df_scores, df_p_values, return_nonzero_annotations, p_threshold)
240
241        domain = Domain([ContinuousVariable(ct) for ct in scores.columns.values])
242        scores_table = Table(domain, scores.values)
243        return scores_table
244