1import numpy as np
2from scipy.stats import zscore, rankdata
3from sklearn.preprocessing import quantile_transform
4
5from Orange.data.table import Table
6from Orange.preprocess.preprocess import Preprocess
7
8
class LogarithmicScale(Preprocess):
    """
    Apply a base-2 logarithm to the data after adding one, i.e. log2(x + 1).

    The input is left untouched; the transformed values are stored in the
    ``X`` attribute of a copy of the input.
    """

    def __call__(self, data) -> Table:
        transformed = data.copy()
        # Shift by one before taking the logarithm so that zeros map to zero.
        shifted = data.X + 1
        transformed.X = np.log2(shifted)
        return transformed
14
15
class ZScore(Preprocess):
    """
    Standardize the data by computing the z score along the chosen axis.

    NaN values in the computed scores are replaced with 0.

    Detailed description: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.zscore.html
    """

    def __init__(self, axis=0):
        # Axis handed over to scipy.stats.zscore.
        self.axis = axis

    def __call__(self, data) -> Table:
        standardized = data.copy()

        scores = zscore(data.X, axis=self.axis)
        # Replace undefined scores (NaN) with zero before storing the result.
        nan_mask = np.isnan(scores)
        scores[nan_mask] = 0

        standardized.X = scores
        return standardized
31
32
class QuantileTransform(Preprocess):
    """
    Map the data onto a uniform or a normal distribution using quantiles.

    This is a thin wrapper around sklearn.preprocessing.quantile_transform;
    the constructor arguments are passed on to it unchanged.

    Detailed description: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html
    """

    def __init__(self, axis=0, n_quantiles=1000, output_distribution='uniform'):
        # Stored as-is and forwarded to quantile_transform on every call.
        self.axis = axis
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution

    def __call__(self, data) -> Table:
        transformed = data.copy()
        options = {
            'n_quantiles': self.n_quantiles,
            'output_distribution': self.output_distribution,
            'copy': True,
            'axis': self.axis,
        }
        transformed.X = quantile_transform(transformed.X, **options)
        return transformed
55
56
class QuantileNormalization(Preprocess):
    """
    Quantile normalize the data so that every row follows the same
    reference distribution, obtained by averaging each quantile across rows.

    Each value is replaced by the reference value at its rank within the row.
    Tied values share an average rank; when that rank is fractional, the
    reference values at the neighbouring integer ranks are averaged.

    Detailed description: https://en.wikipedia.org/wiki/Quantile_normalization
    """

    def __call__(self, data) -> Table:
        normalized = data.copy()
        values = normalized.X

        # Reference distribution: sort every row, then average position-wise
        # over all rows.
        sorted_rows = np.sort(values, axis=1)
        reference = sorted_rows.mean(axis=0)

        # Zero-based rank of every value within its row; ties receive the
        # average of their ranks, which may be fractional.
        ranks = rankdata(values, method='average', axis=1) - 1

        # Look up the reference at the integer ranks below and above, and
        # take the midpoint of the two.
        lower = ranks.astype(int)
        upper = np.ceil(ranks).astype(int)
        normalized.X = (np.take(reference, lower) + np.take(reference, upper)) / 2

        return normalized
76