1import numpy as np 2import pandas as pd 3from scipy.sparse import issparse 4from pointannotator.annotate_samples import ( 5 PFUN_BINOMIAL, 6 SCORING_LOG_FDR, 7 SCORING_EXP_RATIO, 8 PFUN_HYPERGEOMETRIC, 9 SCORING_MARKERS_SUM, 10 AnnotateSamples, 11) 12 13from Orange.data import Table, Domain, ContinuousVariable 14 15from orangecontrib.bioinformatics.ncbi.gene import GeneInfo 16from orangecontrib.bioinformatics.widgets.utils.data import TAX_ID 17 18__all__ = [ 19 "PFUN_BINOMIAL", 20 "PFUN_HYPERGEOMETRIC", 21 "SCORING_EXP_RATIO", 22 "SCORING_LOG_FDR", 23 "SCORING_MARKERS_SUM", 24 "AnnotateSamplesMeta", 25] 26 27 28class ScoringNotImplemented(Exception): 29 pass 30 31 32class AnnotateSamplesMeta: 33 """ 34 AnnotateSamples is a meta class used for the annotation of data items with 35 the labels Mann-Whitney U test for selecting important values and 36 the Hyper-geometric for assigning the labels. This class is a proxy to the 37 external library - point-annotator. 38 39 Example for full annotation: 40 41 >>> from Orange.data import Table 42 >>> from orangecontrib.bioinformatics.utils import serverfiles 43 >>> from orangecontrib.bioinformatics.annotation.annotate_samples import \ 44 ... AnnotateSamples 45 >>> from orangecontrib.bioinformatics.widgets.utils.data import TAX_ID 46 >>> from Orange.data.filter import FilterString, Values 47 >>> 48 >>> data = Table("https://datasets.orange.biolab.si/sc/aml-1k.tab.gz") 49 >>> data.attributes[TAX_ID] = "9606" # table needs to have an organism ID 50 >>> markers_path = serverfiles.localpath_download( 51 ... 'marker_genes','panglao_gene_markers.tab') 52 >>> marker = Table(markers_path) 53 >>> 54 >>> # filter only human markers 55 >>> f = FilterString("Organism", FilterString.Equal, "Human") 56 >>> markers = Values([f])(marker) 57 >>> 58 >>> annotator = AnnotateSamples 59 >>> z = annotator.mann_whitney_test(data) 60 >>> scores, p_val = AnnotateSamples.assign_annotations(z, markers, data) 61 >>> scores = annotator.filter_annotations(scores, p_val, p_threshold=0.05) 62 """ 63 64 @staticmethod 65 def _to_pandas(data, use_entrez_id=False): 66 """ 67 Transform Orange table to pandas dataframe. 68 69 Parameters 70 ---------- 71 data : Orange.data.Table 72 Orange table with gene data. We suppose that genes have `Entrez ID` 73 assigned. 74 use_entrez_id : bool 75 This parameter tells whether to use Entrez ID as column names. 76 When it is False variables names will be used 77 78 Returns 79 ------- 80 pd.DataFrame 81 Pandas DataFrame with EntrezID 82 list 83 List of booleans with True on location of included columns in 84 dataframe. 85 """ 86 if use_entrez_id: 87 entrez_ids = [x.attributes.get("Entrez ID") for x in data.domain.attributes] 88 has_entrez_id = np.array([x is not None for x in entrez_ids]) 89 columns = list(map(str, np.array(entrez_ids))) 90 columns_subset = has_entrez_id.tolist() 91 else: 92 columns = list(map(str, data.domain.attributes)) 93 columns_subset = None 94 95 if issparse(data.X): 96 df = pd.DataFrame.sparse.from_spmatrix(data.X, columns=columns) 97 else: 98 df = pd.DataFrame(data.X, columns=columns) 99 return (df if columns_subset is None else df.loc[:, columns_subset], columns_subset) 100 101 @staticmethod 102 def mann_whitney_test(data): 103 """ 104 Compute z values with Mann-Whitney U test. 105 106 Parameters 107 ---------- 108 data : Orange.data.Table 109 Tabular data with gene expressions 110 111 Returns 112 ------- 113 Orange.data.Table 114 Z-value for each item. 115 """ 116 df, columns = AnnotateSamplesMeta._to_pandas(data, use_entrez_id=False) 117 df_z_values = AnnotateSamples.mann_whitney_test(df) 118 119 z_table = Table(data.domain, df_z_values.values, data.Y, metas=data.metas) 120 121 return z_table 122 123 @staticmethod 124 def _prepare_annotations(ann): 125 """ 126 This function prepares annotation to be acceptable by the method. 127 """ 128 # transform to string if discrete else keep string 129 ct_values = ( 130 list(map(ann.domain["Cell Type"].repr_val, ann.get_column_view("Cell Type")[0])) 131 if ann.domain["Cell Type"].is_discrete 132 else ann.get_column_view("Cell Type")[0] 133 ) 134 entrez_values = ( 135 list(map(ann.domain["Entrez ID"].repr_val, ann.get_column_view("Entrez ID")[0])) 136 if ann.domain["Entrez ID"].is_discrete 137 else ann.get_column_view("Entrez ID")[0] 138 ) 139 140 # the framework recognizes Gene instead of Entrez ID 141 df_available_annotations = pd.DataFrame(list(zip(ct_values, entrez_values)), columns=["Cell Type", "Gene"]) 142 return df_available_annotations 143 144 @staticmethod 145 def assign_annotations( 146 z_values, available_annotations, data, z_threshold=1, p_value_fun=PFUN_BINOMIAL, scoring=SCORING_EXP_RATIO 147 ): 148 """ 149 The function gets a set of attributes (e.g. genes) for each cell and 150 attributes for each annotation. It returns the annotations significant 151 for each cell. 152 153 Parameters 154 ---------- 155 z_values : Orange.data.Table 156 Table which show z values for each item 157 available_annotations : Orange.data.Table 158 Available annotations (e.g. cell types) 159 z_threshold : float 160 The threshold for selecting the attribute. For each item the 161 attributes with z-value above this value are selected. 162 p_value_fun : str, optional (defaults: TEST_BINOMIAL) 163 A function that calculates p-value. It can be either 164 PFUN_BINOMIAL that uses statistics.Binomial().p_value or 165 PFUN_HYPERGEOMETRIC that uses hypergeom.sf. 166 data : Orange.data.Table 167 Tabular data with gene expressions - we need that to compute scores. 168 scoring : str, optional (default=SCORING_EXP_RATIO) 169 Type of scoring 170 171 Returns 172 ------- 173 Orange.data.Table 174 Annotation probabilities 175 Orange.data.Table 176 Annotation fdrs 177 """ 178 # checks that assures that data are ok 179 assert TAX_ID in data.attributes, "The input table needs to have a " "tax_id attribute" 180 assert any( 181 "Entrez ID" in x.attributes for x in data.domain.attributes 182 ), "Input data do not contain gene expression data." 183 184 # retrieve number of genes 185 tax_id = data.attributes[TAX_ID] 186 n = len(GeneInfo(tax_id)) # number of genes for organism 187 188 # transform data to pandas dataframe 189 df_z_values, _ = AnnotateSamplesMeta._to_pandas(z_values, use_entrez_id=True) 190 df_data, _ = AnnotateSamplesMeta._to_pandas(data, use_entrez_id=True) 191 df_available_annotations = AnnotateSamplesMeta._prepare_annotations(available_annotations) 192 df_available_annotations = df_available_annotations[df_available_annotations["Gene"] != "?"] 193 194 # call the method 195 scores, fdrs = AnnotateSamples.assign_annotations( 196 df_z_values, 197 df_available_annotations, 198 df_data, 199 num_all_attributes=n, 200 attributes_col="Gene", 201 annotations_col="Cell Type", 202 z_threshold=z_threshold, 203 p_value_fun=p_value_fun, 204 scoring=scoring, 205 ) 206 # create orange tables 207 domain = Domain([ContinuousVariable(ct) for ct in scores.columns.values]) 208 scores_table = Table(domain, scores.values) 209 fdrs_table = Table(domain, fdrs.values) 210 211 return scores_table, fdrs_table 212 213 @staticmethod 214 def filter_annotations(scores, p_values, return_nonzero_annotations=True, p_threshold=0.05): 215 """ 216 This function filters the probabilities on places that do not reach the 217 threshold for p-value and filter zero columns 218 return_nonzero_annotations is True. 219 220 Parameters 221 ---------- 222 scores : Orange.data.Table 223 Scores for each annotations for each cell 224 p_values : Orange.data.Table 225 p-value scores for annotations for each cell 226 return_nonzero_annotations : bool 227 Flag that enables filtering the non-zero columns. 228 p_threshold : float 229 A threshold for accepting the annotations. Annotations that has FDR 230 value bellow this threshold are used. 231 232 Returns 233 ------- 234 Orange.data.Table 235 Filtered scores for each annotations for each cell 236 """ 237 df_scores, _ = AnnotateSamplesMeta._to_pandas(scores) 238 df_p_values, _ = AnnotateSamplesMeta._to_pandas(p_values) 239 scores = AnnotateSamples.filter_annotations(df_scores, df_p_values, return_nonzero_annotations, p_threshold) 240 241 domain = Domain([ContinuousVariable(ct) for ct in scores.columns.values]) 242 scores_table = Table(domain, scores.values) 243 return scores_table 244